blob: f4160e4b4ed5c114cf7d610dbd97a1569060a077 [file] [log] [blame]
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -08001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Felicia Lim0c2090c2017-07-05 17:36:56 -070032#define ANALYSIS_C
33
34#include <stdio.h>
35
36#include "mathops.h"
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080037#include "kiss_fft.h"
38#include "celt.h"
39#include "modes.h"
40#include "arch.h"
41#include "quant_bands.h"
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080042#include "analysis.h"
43#include "mlp.h"
44#include "stack_alloc.h"
Felicia Lim0c2090c2017-07-05 17:36:56 -070045#include "float_cast.h"
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080046
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080047#ifndef M_PI
48#define M_PI 3.141592653
49#endif
50
Felicia Lim0c2090c2017-07-05 17:36:56 -070051#ifndef DISABLE_FLOAT_API
52
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -080053static const float dct_table[128] = {
54 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
63 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
64 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
65 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
66 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
67 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
68 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
69 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
70};
71
72static const float analysis_window[240] = {
73 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
74 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
75 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
76 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
77 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
78 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
79 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
80 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
81 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
82 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
83 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
84 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
85 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
86 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
87 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
88 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
89 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
90 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
91 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
92 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
93 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
94 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
95 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
103};
104
105static const int tbands[NB_TBANDS+1] = {
Felicia Lim0c2090c2017-07-05 17:36:56 -0700106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800107};
108
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800109#define NB_TONAL_SKIP_BANDS 9
110
Felicia Lim0c2090c2017-07-05 17:36:56 -0700111static opus_val32 silk_resampler_down2_hp(
112 opus_val32 *S, /* I/O State vector [ 2 ] */
113 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
114 const opus_val32 *in, /* I Input signal [ len ] */
115 int inLen /* I Number of input samples */
116)
117{
118 int k, len2 = inLen/2;
119 opus_val32 in32, out32, out32_hp, Y, X;
120 opus_val64 hp_ener = 0;
121 /* Internal variables and state are in Q10 format */
122 for( k = 0; k < len2; k++ ) {
123 /* Convert to Q10 */
124 in32 = in[ 2 * k ];
125
126 /* All-pass section for even input sample */
127 Y = SUB32( in32, S[ 0 ] );
128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
129 out32 = ADD32( S[ 0 ], X );
130 S[ 0 ] = ADD32( in32, X );
131 out32_hp = out32;
132 /* Convert to Q10 */
133 in32 = in[ 2 * k + 1 ];
134
135 /* All-pass section for odd input sample, and add to output of previous section */
136 Y = SUB32( in32, S[ 1 ] );
137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
138 out32 = ADD32( out32, S[ 1 ] );
139 out32 = ADD32( out32, X );
140 S[ 1 ] = ADD32( in32, X );
141
142 Y = SUB32( -in32, S[ 2 ] );
143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
144 out32_hp = ADD32( out32_hp, S[ 2 ] );
145 out32_hp = ADD32( out32_hp, X );
146 S[ 2 ] = ADD32( -in32, X );
147
148 hp_ener += out32_hp*(opus_val64)out32_hp;
149 /* Add, convert back to int16 and store to output */
150 out[ k ] = HALF32(out32);
151 }
152#ifdef FIXED_POINT
153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
155#endif
156 return (opus_val32)hp_ener;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800157}
158
Felicia Lim0c2090c2017-07-05 17:36:56 -0700159static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
160{
161 VARDECL(opus_val32, tmp);
162 opus_val32 scale;
163 int j;
164 opus_val32 ret = 0;
165 SAVE_STACK;
166
167 if (subframe==0) return 0;
168 if (Fs == 48000)
169 {
170 subframe *= 2;
171 offset *= 2;
172 } else if (Fs == 16000) {
173 subframe = subframe*2/3;
174 offset = offset*2/3;
175 }
176 ALLOC(tmp, subframe, opus_val32);
177
178 downmix(_x, tmp, subframe, offset, c1, c2, C);
179#ifdef FIXED_POINT
180 scale = (1<<SIG_SHIFT);
181#else
182 scale = 1.f/32768;
183#endif
184 if (c2==-2)
185 scale /= C;
186 else if (c2>-1)
187 scale /= 2;
188 for (j=0;j<subframe;j++)
189 tmp[j] *= scale;
190 if (Fs == 48000)
191 {
192 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193 } else if (Fs == 24000) {
194 OPUS_COPY(y, tmp, subframe);
195 } else if (Fs == 16000) {
196 VARDECL(opus_val32, tmp3x);
197 ALLOC(tmp3x, 3*subframe, opus_val32);
198 /* Don't do this at home! This resampler is horrible and it's only (barely)
199 usable for the purpose of the analysis because we don't care about all
200 the aliasing between 8 kHz and 12 kHz. */
201 for (j=0;j<subframe;j++)
202 {
203 tmp3x[3*j] = tmp[j];
204 tmp3x[3*j+1] = tmp[j];
205 tmp3x[3*j+2] = tmp[j];
206 }
207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208 }
209 RESTORE_STACK;
210 return ret;
211}
212
213void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
flimc91ee5b2016-01-26 14:33:44 +0100214{
215 /* Initialize reusable fields. */
216 tonal->arch = opus_select_arch();
Felicia Lim0c2090c2017-07-05 17:36:56 -0700217 tonal->Fs = Fs;
flimc91ee5b2016-01-26 14:33:44 +0100218 /* Clear remaining fields. */
219 tonality_analysis_reset(tonal);
220}
221
222void tonality_analysis_reset(TonalityAnalysisState *tonal)
223{
224 /* Clear non-reusable fields. */
225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
Felicia Lim0c2090c2017-07-05 17:36:56 -0700227 tonal->music_confidence = .9f;
228 tonal->speech_confidence = .1f;
flimc91ee5b2016-01-26 14:33:44 +0100229}
230
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800231void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232{
233 int pos;
234 int curr_lookahead;
235 float psum;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700236 float tonality_max;
237 float tonality_avg;
238 int tonality_count;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800239 int i;
240
241 pos = tonal->read_pos;
242 curr_lookahead = tonal->write_pos-tonal->read_pos;
243 if (curr_lookahead<0)
244 curr_lookahead += DETECT_SIZE;
245
Felicia Lim0c2090c2017-07-05 17:36:56 -0700246 /* On long frames, look at the second analysis window rather than the first. */
247 if (len > tonal->Fs/50 && pos != tonal->write_pos)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800248 {
249 pos++;
250 if (pos==DETECT_SIZE)
251 pos=0;
252 }
253 if (pos == tonal->write_pos)
254 pos--;
255 if (pos<0)
256 pos = DETECT_SIZE-1;
257 OPUS_COPY(info_out, &tonal->info[pos], 1);
Felicia Lim0c2090c2017-07-05 17:36:56 -0700258 tonality_max = tonality_avg = info_out->tonality;
259 tonality_count = 1;
260 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
261 for (i=0;i<3;i++)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800262 {
Felicia Lim0c2090c2017-07-05 17:36:56 -0700263 pos++;
264 if (pos==DETECT_SIZE)
265 pos = 0;
266 if (pos == tonal->write_pos)
267 break;
268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
269 tonality_avg += tonal->info[pos].tonality;
270 tonality_count++;
271 }
272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
273 tonal->read_subframe += len/(tonal->Fs/400);
274 while (tonal->read_subframe>=8)
275 {
276 tonal->read_subframe -= 8;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800277 tonal->read_pos++;
278 }
279 if (tonal->read_pos>=DETECT_SIZE)
280 tonal->read_pos-=DETECT_SIZE;
281
Felicia Lim0c2090c2017-07-05 17:36:56 -0700282 /* The -1 is to compensate for the delay in the features themselves. */
283 curr_lookahead = IMAX(curr_lookahead-1, 0);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800284
285 psum=0;
286 /* Summing the probability of transition patterns that involve music at
287 time (DETECT_SIZE-curr_lookahead-1) */
288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
289 psum += tonal->pmusic[i];
290 for (;i<DETECT_SIZE;i++)
291 psum += tonal->pspeech[i];
292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800294
295 info_out->music_prob = psum;
296}
297
Felicia Lim0c2090c2017-07-05 17:36:56 -0700298static const float std_feature_bias[9] = {
299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
300 2.163313f, 1.260756f, 1.116868f, 1.918795f
301};
302
303#define LEAKAGE_OFFSET 2.5f
304#define LEAKAGE_SLOPE 2.f
305
306#ifdef FIXED_POINT
307/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
308 compensate for that in the energy. */
309#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
310#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
311#else
312#define SCALE_ENER(e) (e)
313#endif
314
flimc91ee5b2016-01-26 14:33:44 +0100315static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800316{
317 int i, b;
318 const kiss_fft_state *kfft;
319 VARDECL(kiss_fft_cpx, in);
320 VARDECL(kiss_fft_cpx, out);
321 int N = 480, N2=240;
322 float * OPUS_RESTRICT A = tonal->angle;
323 float * OPUS_RESTRICT dA = tonal->d_angle;
324 float * OPUS_RESTRICT d2A = tonal->d2_angle;
325 VARDECL(float, tonality);
326 VARDECL(float, noisiness);
327 float band_tonality[NB_TBANDS];
328 float logE[NB_TBANDS];
329 float BFCC[8];
330 float features[25];
331 float frame_tonality;
332 float max_frame_tonality;
333 /*float tw_sum=0;*/
334 float frame_noisiness;
335 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
336 float slope=0;
337 float frame_stationarity;
338 float relativeE;
339 float frame_probs[2];
340 float alpha, alphaE, alphaE2;
341 float frame_loudness;
342 float bandwidth_mask;
343 int bandwidth=0;
344 float maxE = 0;
345 float noise_floor;
346 int remaining;
347 AnalysisInfo *info;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700348 float hp_ener;
349 float tonality2[240];
350 float midE[8];
351 float spec_variability=0;
352 float band_log2[NB_TBANDS+1];
353 float leakage_from[NB_TBANDS+1];
354 float leakage_to[NB_TBANDS+1];
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800355 SAVE_STACK;
356
Felicia Lim0c2090c2017-07-05 17:36:56 -0700357 alpha = 1.f/IMIN(10, 1+tonal->count);
358 alphaE = 1.f/IMIN(25, 1+tonal->count);
359 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800360
Felicia Lim0c2090c2017-07-05 17:36:56 -0700361 if (tonal->Fs == 48000)
362 {
363 /* len and offset are now at 24 kHz. */
364 len/= 2;
365 offset /= 2;
366 } else if (tonal->Fs == 16000) {
367 len = 3*len/2;
368 offset = 3*offset/2;
369 }
370
371 if (tonal->count<4) {
372 if (tonal->application == OPUS_APPLICATION_VOIP)
373 tonal->music_prob = .1f;
374 else
375 tonal->music_prob = .625f;
376 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800377 kfft = celt_mode->mdct.kfft[0];
378 if (tonal->count==0)
379 tonal->mem_fill = 240;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
384 {
385 tonal->mem_fill += len;
386 /* Don't have enough to update the analysis */
387 RESTORE_STACK;
388 return;
389 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700390 hp_ener = tonal->hp_ener_accum;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800391 info = &tonal->info[tonal->write_pos++];
392 if (tonal->write_pos>=DETECT_SIZE)
393 tonal->write_pos-=DETECT_SIZE;
394
395 ALLOC(in, 480, kiss_fft_cpx);
396 ALLOC(out, 480, kiss_fft_cpx);
397 ALLOC(tonality, 240, float);
398 ALLOC(noisiness, 240, float);
399 for (i=0;i<N2;i++)
400 {
401 float w = analysis_window[i];
402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
406 }
407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
Felicia Lim0c2090c2017-07-05 17:36:56 -0700409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
410 &tonal->inmem[240], tonal->downmix_state, remaining,
411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800412 tonal->mem_fill = 240 + remaining;
flimc91ee5b2016-01-26 14:33:44 +0100413 opus_fft(kfft, in, out, tonal->arch);
414#ifndef FIXED_POINT
415 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
416 if (celt_isnan(out[0].r))
417 {
418 info->valid = 0;
419 RESTORE_STACK;
420 return;
421 }
422#endif
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800423
424 for (i=1;i<N2;i++)
425 {
426 float X1r, X2r, X1i, X2i;
427 float angle, d_angle, d2_angle;
428 float angle2, d_angle2, d2_angle2;
429 float mod1, mod2, avg_mod;
430 X1r = (float)out[i].r+out[N-i].r;
431 X1i = (float)out[i].i-out[N-i].i;
432 X2r = (float)out[i].i+out[N-i].i;
433 X2i = (float)out[N-i].r-out[i].r;
434
435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
436 d_angle = angle - A[i];
437 d2_angle = d_angle - dA[i];
438
439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
440 d_angle2 = angle2 - angle;
441 d2_angle2 = d_angle2 - d_angle;
442
Felicia Lim0c2090c2017-07-05 17:36:56 -0700443 mod1 = d2_angle - (float)float2int(d2_angle);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800444 noisiness[i] = ABS16(mod1);
445 mod1 *= mod1;
446 mod1 *= mod1;
447
Felicia Lim0c2090c2017-07-05 17:36:56 -0700448 mod2 = d2_angle2 - (float)float2int(d2_angle2);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800449 noisiness[i] += ABS16(mod2);
450 mod2 *= mod2;
451 mod2 *= mod2;
452
Felicia Lim0c2090c2017-07-05 17:36:56 -0700453 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
454 /* This introduces an extra delay of 2 frames in the detection. */
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700456 /* No delay on this detection, but it's less reliable. */
457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800458
459 A[i] = angle2;
460 dA[i] = d_angle2;
461 d2A[i] = mod2;
462 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700463 for (i=2;i<N2-1;i++)
464 {
465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
467 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800468 frame_tonality = 0;
469 max_frame_tonality = 0;
470 /*tw_sum = 0;*/
471 info->activity = 0;
472 frame_noisiness = 0;
473 frame_stationarity = 0;
474 if (!tonal->count)
475 {
476 for (b=0;b<NB_TBANDS;b++)
477 {
478 tonal->lowE[b] = 1e10;
479 tonal->highE[b] = -1e10;
480 }
481 }
482 relativeE = 0;
483 frame_loudness = 0;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700484 /* The energy of the very first band is special because of DC. */
485 {
486 float E = 0;
487 float X1r, X2r;
488 X1r = 2*(float)out[0].r;
489 X2r = 2*(float)out[0].i;
490 E = X1r*X1r + X2r*X2r;
491 for (i=1;i<4;i++)
492 {
493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
495 E += binE;
496 }
497 E = SCALE_ENER(E);
498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
499 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800500 for (b=0;b<NB_TBANDS;b++)
501 {
502 float E=0, tE=0, nE=0;
503 float L1, L2;
504 float stationarity;
505 for (i=tbands[b];i<tbands[b+1];i++)
506 {
507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700509 binE = SCALE_ENER(binE);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800510 E += binE;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700511 tE += binE*MAX32(0, tonality[i]);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800512 nE += binE*2.f*(.5f-noisiness[i]);
513 }
flimc91ee5b2016-01-26 14:33:44 +0100514#ifndef FIXED_POINT
515 /* Check for extreme band energies that could cause NaNs later. */
516 if (!(E<1e9f) || celt_isnan(E))
517 {
518 info->valid = 0;
519 RESTORE_STACK;
520 return;
521 }
522#endif
523
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800524 tonal->E[tonal->E_count][b] = E;
525 frame_noisiness += nE/(1e-15f+E);
526
527 frame_loudness += (float)sqrt(E+1e-10f);
528 logE[b] = (float)log(E+1e-10f);
Felicia Lim0c2090c2017-07-05 17:36:56 -0700529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
530 tonal->logE[tonal->E_count][b] = logE[b];
531 if (tonal->count==0)
532 tonal->highE[b] = tonal->lowE[b] = logE[b];
533 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800534 {
Felicia Lim0c2090c2017-07-05 17:36:56 -0700535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
536 tonal->highE[b] -= .01f;
537 else
538 tonal->lowE[b] += .01f;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800539 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700540 if (logE[b] > tonal->highE[b])
541 {
542 tonal->highE[b] = logE[b];
543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
544 } else if (logE[b] < tonal->lowE[b])
545 {
546 tonal->lowE[b] = logE[b];
547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
548 }
549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800550
551 L1=L2=0;
552 for (i=0;i<NB_FRAMES;i++)
553 {
554 L1 += (float)sqrt(tonal->E[i][b]);
555 L2 += tonal->E[i][b];
556 }
557
558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
559 stationarity *= stationarity;
560 stationarity *= stationarity;
561 frame_stationarity += stationarity;
562 /*band_tonality[b] = tE/(1e-15+E)*/;
563 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
564#if 0
565 if (b>=NB_TONAL_SKIP_BANDS)
566 {
567 frame_tonality += tweight[b]*band_tonality[b];
568 tw_sum += tweight[b];
569 }
570#else
571 frame_tonality += band_tonality[b];
572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
574#endif
575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
576 slope += band_tonality[b]*(b-8);
577 /*printf("%f %f ", band_tonality[b], stationarity);*/
578 tonal->prev_band_tonality[b] = band_tonality[b];
579 }
580
Felicia Lim0c2090c2017-07-05 17:36:56 -0700581 leakage_from[0] = band_log2[0];
582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
583 for (b=1;b<NB_TBANDS+1;b++)
584 {
585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
588 }
589 for (b=NB_TBANDS-2;b>=0;b--)
590 {
591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
594 }
595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
596 for (b=0;b<NB_TBANDS+1;b++)
597 {
598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
599 represents the boost needed to overcome the amount of analysis leakage
600 cause in a weaker band b by louder neighbouring bands.
601 The second, based on leakage_from[], applies to a loud band b for
602 which the quantization noise causes synthesis leakage to the weaker
603 neighbouring bands. */
604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
607 }
608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
609
610 for (i=0;i<NB_FRAMES;i++)
611 {
612 int j;
613 float mindist = 1e15f;
614 for (j=0;j<NB_FRAMES;j++)
615 {
616 int k;
617 float dist=0;
618 for (k=0;k<NB_TBANDS;k++)
619 {
620 float tmp;
621 tmp = tonal->logE[i][k] - tonal->logE[j][k];
622 dist += tmp*tmp;
623 }
624 if (j!=i)
625 mindist = MIN32(mindist, dist);
626 }
627 spec_variability += mindist;
628 }
629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800630 bandwidth_mask = 0;
631 bandwidth = 0;
632 maxE = 0;
633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800634 noise_floor *= noise_floor;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700635 for (b=0;b<NB_TBANDS;b++)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800636 {
637 float E=0;
638 int band_start, band_end;
639 /* Keep a margin of 300 Hz for aliasing */
Felicia Lim0c2090c2017-07-05 17:36:56 -0700640 band_start = tbands[b];
641 band_end = tbands[b+1];
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800642 for (i=band_start;i<band_end;i++)
643 {
644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
646 E += binE;
647 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700648 E = SCALE_ENER(E);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800649 maxE = MAX32(maxE, E);
650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
651 E = MAX32(E, tonal->meanE[b]);
652 /* Use a simple follower with 13 dB/Bark slope for spreading function */
653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
654 /* Consider the band "active" only if all these conditions are met:
655 1) less than 10 dB below the simple follower
656 2) less than 90 dB below the peak band (maximal masking possible considering
657 both the ATH and the loudness-dependent slope of the spreading function)
658 3) above the PCM quantization noise floor
Felicia Lim0c2090c2017-07-05 17:36:56 -0700659 We use b+1 because the first CELT band isn't included in tbands[]
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800660 */
661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
Felicia Lim0c2090c2017-07-05 17:36:56 -0700662 bandwidth = b+1;
663 }
664 /* Special case for the last two bands, for which we don't have spectrum but only
665 the energy above 12 kHz. */
666 if (tonal->Fs == 48000) {
667 float ratio;
668 float E = hp_ener*(1.f/(240*240));
669 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
670#ifdef FIXED_POINT
671 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
672 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
673#endif
674 maxE = MAX32(maxE, E);
675 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
676 E = MAX32(E, tonal->meanE[b]);
677 /* Use a simple follower with 13 dB/Bark slope for spreading function */
678 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
679 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
680 bandwidth = 20;
681 /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
682 if (bandwidth >= 17)
683 bandwidth = 20;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800684 }
685 if (tonal->count<=2)
686 bandwidth = 20;
687 frame_loudness = 20*(float)log10(frame_loudness);
Felicia Lim0c2090c2017-07-05 17:36:56 -0700688 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800689 tonal->lowECount *= (1-alphaE);
690 if (frame_loudness < tonal->Etracker-30)
691 tonal->lowECount += alphaE;
692
693 for (i=0;i<8;i++)
694 {
695 float sum=0;
696 for (b=0;b<16;b++)
697 sum += dct_table[i*16+b]*logE[b];
698 BFCC[i] = sum;
699 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700700 for (i=0;i<8;i++)
701 {
702 float sum=0;
703 for (b=0;b<16;b++)
704 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
705 midE[i] = sum;
706 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800707
708 frame_stationarity /= NB_TBANDS;
709 relativeE /= NB_TBANDS;
710 if (tonal->count<10)
Felicia Lim0c2090c2017-07-05 17:36:56 -0700711 relativeE = .5f;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800712 frame_noisiness /= NB_TBANDS;
713#if 1
714 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
715#else
716 info->activity = .5*(1+frame_noisiness-frame_stationarity);
717#endif
718 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
719 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
720 tonal->prev_tonality = frame_tonality;
721
722 slope /= 8*8;
723 info->tonality_slope = slope;
724
725 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700726 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800727 info->tonality = frame_tonality;
728
729 for (i=0;i<4;i++)
730 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
731
732 for (i=0;i<4;i++)
733 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
734
735 for (i=0;i<4;i++)
736 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
737 for (i=0;i<3;i++)
738 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
739
740 if (tonal->count > 5)
741 {
742 for (i=0;i<9;i++)
743 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
744 }
Felicia Lim0c2090c2017-07-05 17:36:56 -0700745 for (i=0;i<4;i++)
746 features[i] = BFCC[i]-midE[i];
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800747
748 for (i=0;i<8;i++)
749 {
750 tonal->mem[i+24] = tonal->mem[i+16];
751 tonal->mem[i+16] = tonal->mem[i+8];
752 tonal->mem[i+8] = tonal->mem[i];
753 tonal->mem[i] = BFCC[i];
754 }
755 for (i=0;i<9;i++)
Felicia Lim0c2090c2017-07-05 17:36:56 -0700756 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
757 features[18] = spec_variability - 0.78f;
758 features[20] = info->tonality - 0.154723f;
759 features[21] = info->activity - 0.724643f;
760 features[22] = frame_stationarity - 0.743717f;
761 features[23] = info->tonality_slope + 0.069216f;
762 features[24] = tonal->lowECount - 0.067930f;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800763
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800764 mlp_process(&net, features, frame_probs);
765 frame_probs[0] = .5f*(frame_probs[0]+1);
766 /* Curve fitting between the MLP probability and the actual probability */
Felicia Lim0c2090c2017-07-05 17:36:56 -0700767 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800768 /* Probability of active audio (as opposed to silence) */
769 frame_probs[1] = .5f*frame_probs[1]+.5f;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700770 frame_probs[1] *= frame_probs[1];
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800771
Felicia Lim0c2090c2017-07-05 17:36:56 -0700772 /* Probability of speech or music vs noise */
773 info->activity_probability = frame_probs[1];
774
775 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800776 {
777 /* Probability of state transition */
778 float tau;
779 /* Represents independence of the MLP probabilities, where
780 beta=1 means fully independent. */
781 float beta;
782 /* Denormalized probability of speech (p0) and music (p1) after update */
783 float p0, p1;
784 /* Probabilities for "all speech" and "all music" */
785 float s0, m0;
786 /* Probability sum for renormalisation */
787 float psum;
788 /* Instantaneous probability of speech and music, with beta pre-applied. */
789 float speech0;
790 float music0;
Felicia Limd03c3732016-07-25 20:28:37 +0200791 float p, q;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800792
Felicia Lim0c2090c2017-07-05 17:36:56 -0700793 /* More silence transitions for speech than for music. */
794 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
795 p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
796 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
797 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
798 /* p0 and p1 are the probabilities of speech and music at this frame
799 using only information from previous frame and applying the
800 state transition model */
801 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
802 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
803 /* We apply the current probability with exponent beta to work around
804 the fact that the probability estimates aren't independent. */
805 p0 *= (float)pow(1-frame_probs[1], beta);
806 p1 *= (float)pow(frame_probs[1], beta);
807 /* Normalise the probabilities to get the Marokv probability of music. */
808 tonal->vad_prob = p1/(p0+p1);
809 info->vad_prob = tonal->vad_prob;
810 /* Consider that silence has a 50-50 probability of being speech or music. */
811 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
812
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800813 /* One transition every 3 minutes of active audio */
Felicia Lim0c2090c2017-07-05 17:36:56 -0700814 tau = .0001f;
Felicia Limd03c3732016-07-25 20:28:37 +0200815 /* Adapt beta based on how "unexpected" the new prob is */
816 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
817 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
Felicia Lim0c2090c2017-07-05 17:36:56 -0700818 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800819 /* p0 and p1 are the probabilities of speech and music at this frame
820 using only information from previous frame and applying the
821 state transition model */
822 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
823 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
824 /* We apply the current probability with exponent beta to work around
825 the fact that the probability estimates aren't independent. */
826 p0 *= (float)pow(1-frame_probs[0], beta);
827 p1 *= (float)pow(frame_probs[0], beta);
828 /* Normalise the probabilities to get the Marokv probability of music. */
829 tonal->music_prob = p1/(p0+p1);
830 info->music_prob = tonal->music_prob;
831
Felicia Lim0c2090c2017-07-05 17:36:56 -0700832 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800833 /* This chunk of code deals with delayed decision. */
834 psum=1e-20f;
835 /* Instantaneous probability of speech and music, with beta pre-applied. */
836 speech0 = (float)pow(1-frame_probs[0], beta);
837 music0 = (float)pow(frame_probs[0], beta);
838 if (tonal->count==1)
839 {
Felicia Lim0c2090c2017-07-05 17:36:56 -0700840 if (tonal->application == OPUS_APPLICATION_VOIP)
841 tonal->pmusic[0] = .1f;
842 else
843 tonal->pmusic[0] = .625f;
844 tonal->pspeech[0] = 1-tonal->pmusic[0];
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800845 }
846 /* Updated probability of having only speech (s0) or only music (m0),
847 before considering the new observation. */
848 s0 = tonal->pspeech[0] + tonal->pspeech[1];
849 m0 = tonal->pmusic [0] + tonal->pmusic [1];
850 /* Updates s0 and m0 with instantaneous probability. */
851 tonal->pspeech[0] = s0*(1-tau)*speech0;
852 tonal->pmusic [0] = m0*(1-tau)*music0;
853 /* Propagate the transition probabilities */
854 for (i=1;i<DETECT_SIZE-1;i++)
855 {
856 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
857 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
858 }
859 /* Probability that the latest frame is speech, when all the previous ones were music. */
860 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
861 /* Probability that the latest frame is music, when all the previous ones were speech. */
862 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
863
864 /* Renormalise probabilities to 1 */
865 for (i=0;i<DETECT_SIZE;i++)
866 psum += tonal->pspeech[i] + tonal->pmusic[i];
867 psum = 1.f/psum;
868 for (i=0;i<DETECT_SIZE;i++)
869 {
870 tonal->pspeech[i] *= psum;
871 tonal->pmusic [i] *= psum;
872 }
873 psum = tonal->pmusic[0];
874 for (i=1;i<DETECT_SIZE;i++)
875 psum += tonal->pspeech[i];
876
877 /* Estimate our confidence in the speech/music decisions */
878 if (frame_probs[1]>.75)
879 {
880 if (tonal->music_prob>.9)
881 {
882 float adapt;
883 adapt = 1.f/(++tonal->music_confidence_count);
884 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
885 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
886 }
887 if (tonal->music_prob<.1)
888 {
889 float adapt;
890 adapt = 1.f/(++tonal->speech_confidence_count);
891 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
892 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
893 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800894 }
895 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800896 tonal->last_music = tonal->music_prob>.5f;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700897#ifdef MLP_TRAINING
898 for (i=0;i<25;i++)
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800899 printf("%f ", features[i]);
Felicia Lim0c2090c2017-07-05 17:36:56 -0700900 printf("\n");
901#endif
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800902
903 info->bandwidth = bandwidth;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700904 tonal->prev_bandwidth = bandwidth;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800905 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
906 info->noisiness = frame_noisiness;
907 info->valid = 1;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800908 RESTORE_STACK;
909}
910
911void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
912 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
913 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
914{
915 int offset;
916 int pcm_len;
917
Felicia Lim0c2090c2017-07-05 17:36:56 -0700918 analysis_frame_size -= analysis_frame_size&1;
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800919 if (analysis_pcm != NULL)
920 {
921 /* Avoid overflow/wrap-around of the analysis buffer */
Felicia Lim0c2090c2017-07-05 17:36:56 -0700922 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800923
924 pcm_len = analysis_frame_size - analysis->analysis_offset;
925 offset = analysis->analysis_offset;
Felicia Lim0c2090c2017-07-05 17:36:56 -0700926 while (pcm_len>0) {
927 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
928 offset += Fs/50;
929 pcm_len -= Fs/50;
930 }
Vignesh Venkatasubramanian2bd8b542014-02-20 10:50:35 -0800931 analysis->analysis_offset = analysis_frame_size;
932
933 analysis->analysis_offset -= frame_size;
934 }
935
936 analysis_info->valid = 0;
937 tonality_get_info(analysis, analysis_info, frame_size);
938}
Felicia Lim0c2090c2017-07-05 17:36:56 -0700939
940#endif /* DISABLE_FLOAT_API */