blob: 63d3b7325b8477a025375939c2618e513a3e772f [file] [log] [blame]
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -05001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Jean-Marc Valin2ff65562016-08-11 11:05:51 -040032#define ANALYSIS_C
33
34#include <stdio.h>
35
36#include "mathops.h"
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -050037#include "kiss_fft.h"
38#include "celt.h"
39#include "modes.h"
40#include "arch.h"
41#include "quant_bands.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040042#include "analysis.h"
43#include "mlp.h"
Jean-Marc Valina8783a12013-04-15 15:49:40 -040044#include "stack_alloc.h"
Jean-Marc Valincf9409f2016-12-16 17:52:15 -050045#include "float_cast.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040046
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080047#ifndef M_PI
48#define M_PI 3.141592653
49#endif
50
Mark Harrisd6d70372017-02-20 19:51:40 -080051#ifndef DISABLE_FLOAT_API
52
Jean-Marc Valin027ff072012-09-14 15:49:18 -040053static const float dct_table[128] = {
Jean-Marc Valind683c762012-12-21 16:17:38 -050054 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
63 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
64 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
65 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
66 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
67 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
68 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
69 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080070};
71
Jean-Marc Valin027ff072012-09-14 15:49:18 -040072static const float analysis_window[240] = {
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -040073 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
74 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
75 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
76 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
77 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
78 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
79 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
80 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
81 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
82 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
83 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
84 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
85 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
86 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
87 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
88 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
89 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
90 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
91 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
92 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
93 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
94 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
95 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
103};
104
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500105static const int tbands[NB_TBANDS+1] = {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500107};
108
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500109#define NB_TONAL_SKIP_BANDS 9
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800110
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500111static opus_val32 silk_resampler_down2_hp(
112 opus_val32 *S, /* I/O State vector [ 2 ] */
113 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
114 const opus_val32 *in, /* I Input signal [ len ] */
115 int inLen /* I Number of input samples */
116)
117{
118 int k, len2 = inLen/2;
119 opus_val32 in32, out32, out32_hp, Y, X;
120 opus_val64 hp_ener = 0;
121 /* Internal variables and state are in Q10 format */
122 for( k = 0; k < len2; k++ ) {
123 /* Convert to Q10 */
124 in32 = in[ 2 * k ];
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -0400125
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500126 /* All-pass section for even input sample */
127 Y = SUB32( in32, S[ 0 ] );
128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
129 out32 = ADD32( S[ 0 ], X );
130 S[ 0 ] = ADD32( in32, X );
131 out32_hp = out32;
132 /* Convert to Q10 */
133 in32 = in[ 2 * k + 1 ];
134
135 /* All-pass section for odd input sample, and add to output of previous section */
136 Y = SUB32( in32, S[ 1 ] );
137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
138 out32 = ADD32( out32, S[ 1 ] );
139 out32 = ADD32( out32, X );
140 S[ 1 ] = ADD32( in32, X );
141
142 Y = SUB32( -in32, S[ 2 ] );
143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
144 out32_hp = ADD32( out32_hp, S[ 2 ] );
145 out32_hp = ADD32( out32_hp, X );
146 S[ 2 ] = ADD32( -in32, X );
147
148 hp_ener += out32_hp*(opus_val64)out32_hp;
149 /* Add, convert back to int16 and store to output */
150 out[ k ] = HALF32(out32);
151 }
152#ifdef FIXED_POINT
153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
155#endif
Mark Harrisd6d70372017-02-20 19:51:40 -0800156 return (opus_val32)hp_ener;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500157}
158
159static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
160{
161 VARDECL(opus_val32, tmp);
162 opus_val32 scale;
163 int j;
164 opus_val32 ret = 0;
165 SAVE_STACK;
166
167 if (subframe==0) return 0;
168 if (Fs == 48000)
169 {
170 subframe *= 2;
171 offset *= 2;
172 } else if (Fs == 16000) {
173 subframe = subframe*2/3;
174 offset = offset*2/3;
175 }
176 ALLOC(tmp, subframe, opus_val32);
177
178 downmix(_x, tmp, subframe, offset, c1, c2, C);
179#ifdef FIXED_POINT
180 scale = (1<<SIG_SHIFT);
181#else
182 scale = 1.f/32768;
183#endif
184 if (c2==-2)
185 scale /= C;
186 else if (c2>-1)
187 scale /= 2;
188 for (j=0;j<subframe;j++)
189 tmp[j] *= scale;
190 if (Fs == 48000)
191 {
192 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193 } else if (Fs == 24000) {
194 OPUS_COPY(y, tmp, subframe);
195 } else if (Fs == 16000) {
196 VARDECL(opus_val32, tmp3x);
197 ALLOC(tmp3x, 3*subframe, opus_val32);
198 /* Don't do this at home! This resampler is horrible and it's only (barely)
199 usable for the purpose of the analysis because we don't care about all
200 the aliasing between 8 kHz and 12 kHz. */
201 for (j=0;j<subframe;j++)
202 {
203 tmp3x[3*j] = tmp[j];
204 tmp3x[3*j+1] = tmp[j];
205 tmp3x[3*j+2] = tmp[j];
206 }
207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208 }
209 RESTORE_STACK;
210 return ret;
211}
212
213void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
Ralph Gilesd43445f2015-12-30 10:00:17 -0800214{
215 /* Initialize reusable fields. */
216 tonal->arch = opus_select_arch();
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500217 tonal->Fs = Fs;
Ralph Giles9e3872a2015-12-30 15:27:02 -0800218 /* Clear remaining fields. */
219 tonality_analysis_reset(tonal);
220}
221
222void tonality_analysis_reset(TonalityAnalysisState *tonal)
223{
224 /* Clear non-reusable fields. */
225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
Ralph Gilesd43445f2015-12-30 10:00:17 -0800227}
228
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500229void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
230{
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500231 int pos;
232 int curr_lookahead;
233 float psum;
234 int i;
235
236 pos = tonal->read_pos;
237 curr_lookahead = tonal->write_pos-tonal->read_pos;
238 if (curr_lookahead<0)
239 curr_lookahead += DETECT_SIZE;
240
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500241 /* On long frames, look at the second analysis window rather than the first. */
242 if (len > tonal->Fs/50 && pos != tonal->write_pos)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500243 {
244 pos++;
245 if (pos==DETECT_SIZE)
246 pos=0;
247 }
248 if (pos == tonal->write_pos)
249 pos--;
250 if (pos<0)
251 pos = DETECT_SIZE-1;
252 OPUS_COPY(info_out, &tonal->info[pos], 1);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500253 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
254 for (i=0;i<3;i++)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500255 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500256 pos++;
257 if (pos==DETECT_SIZE)
258 pos = 0;
259 if (pos == tonal->write_pos)
260 break;
Mark Harrisd6d70372017-02-20 19:51:40 -0800261 info_out->tonality = MAX32(0, -.03f + MAX32(info_out->tonality, tonal->info[pos].tonality-.05f));
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500262 }
263 tonal->read_subframe += len/(tonal->Fs/400);
264 while (tonal->read_subframe>=8)
265 {
266 tonal->read_subframe -= 8;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500267 tonal->read_pos++;
268 }
269 if (tonal->read_pos>=DETECT_SIZE)
270 tonal->read_pos-=DETECT_SIZE;
271
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500272 /* The -1 is to compensate for the delay in the features themselves. */
273 curr_lookahead = IMAX(curr_lookahead-1, 0);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500274
275 psum=0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400276 /* Summing the probability of transition patterns that involve music at
277 time (DETECT_SIZE-curr_lookahead-1) */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500278 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
279 psum += tonal->pmusic[i];
280 for (;i<DETECT_SIZE;i++)
281 psum += tonal->pspeech[i];
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500282 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500283 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500284
285 info_out->music_prob = psum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500286}
287
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500288static const float std_feature_bias[9] = {
Mark Harrisd6d70372017-02-20 19:51:40 -0800289 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
290 2.163313f, 1.260756f, 1.116868f, 1.918795f
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500291};
292
Ralph Gilesd43445f2015-12-30 10:00:17 -0800293static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500294{
295 int i, b;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500296 const kiss_fft_state *kfft;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400297 VARDECL(kiss_fft_cpx, in);
298 VARDECL(kiss_fft_cpx, out);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500299 int N = 480, N2=240;
Joshua Bowman3b60e812012-10-23 12:18:38 -0700300 float * OPUS_RESTRICT A = tonal->angle;
301 float * OPUS_RESTRICT dA = tonal->d_angle;
302 float * OPUS_RESTRICT d2A = tonal->d2_angle;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400303 VARDECL(float, tonality);
304 VARDECL(float, noisiness);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500305 float band_tonality[NB_TBANDS];
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800306 float logE[NB_TBANDS];
307 float BFCC[8];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400308 float features[25];
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500309 float frame_tonality;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500310 float max_frame_tonality;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500311 /*float tw_sum=0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800312 float frame_noisiness;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500313 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500314 float slope=0;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800315 float frame_stationarity;
316 float relativeE;
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500317 float frame_probs[2];
Jean-Marc Valina2054572011-11-25 23:07:46 -0500318 float alpha, alphaE, alphaE2;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500319 float frame_loudness;
Jean-Marc Valina2054572011-11-25 23:07:46 -0500320 float bandwidth_mask;
321 int bandwidth=0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500322 float maxE = 0;
323 float noise_floor;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500324 int remaining;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500325 AnalysisInfo *info;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500326 float hp_ener;
327 float tonality2[240];
328 float midE[8];
329 float spec_variability=0;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400330 SAVE_STACK;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500331
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500332 alpha = 1.f/IMIN(10, 1+tonal->count);
333 alphaE = 1.f/IMIN(25, 1+tonal->count);
334 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
335
336 if (tonal->Fs == 48000)
337 {
338 /* len and offset are now at 24 kHz. */
339 len/= 2;
340 offset /= 2;
341 } else if (tonal->Fs == 16000) {
342 len = 3*len/2;
343 offset = 3*offset/2;
344 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500345
346 if (tonal->count<4)
347 tonal->music_prob = .5;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500348 kfft = celt_mode->mdct.kfft[0];
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500349 if (tonal->count==0)
350 tonal->mem_fill = 240;
Mark Harrisd6d70372017-02-20 19:51:40 -0800351 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
352 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500353 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500354 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
355 {
356 tonal->mem_fill += len;
357 /* Don't have enough to update the analysis */
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400358 RESTORE_STACK;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500359 return;
360 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500361 hp_ener = tonal->hp_ener_accum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500362 info = &tonal->info[tonal->write_pos++];
363 if (tonal->write_pos>=DETECT_SIZE)
364 tonal->write_pos-=DETECT_SIZE;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500365
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400366 ALLOC(in, 480, kiss_fft_cpx);
367 ALLOC(out, 480, kiss_fft_cpx);
368 ALLOC(tonality, 240, float);
369 ALLOC(noisiness, 240, float);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500370 for (i=0;i<N2;i++)
371 {
372 float w = analysis_window[i];
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500373 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
374 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
375 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
376 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500377 }
378 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
379 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
Mark Harrisd6d70372017-02-20 19:51:40 -0800380 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
381 &tonal->inmem[240], tonal->downmix_state, remaining,
382 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500383 tonal->mem_fill = 240 + remaining;
Ralph Gilesd43445f2015-12-30 10:00:17 -0800384 opus_fft(kfft, in, out, tonal->arch);
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500385#ifndef FIXED_POINT
386 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
387 if (celt_isnan(out[0].r))
388 {
389 info->valid = 0;
390 RESTORE_STACK;
391 return;
392 }
393#endif
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500394
395 for (i=1;i<N2;i++)
396 {
397 float X1r, X2r, X1i, X2i;
398 float angle, d_angle, d2_angle;
399 float angle2, d_angle2, d2_angle2;
400 float mod1, mod2, avg_mod;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500401 X1r = (float)out[i].r+out[N-i].r;
402 X1i = (float)out[i].i-out[N-i].i;
403 X2r = (float)out[i].i+out[N-i].i;
404 X2i = (float)out[N-i].r-out[i].r;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800405
Jean-Marc Valind683c762012-12-21 16:17:38 -0500406 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500407 d_angle = angle - A[i];
408 d2_angle = d_angle - dA[i];
409
Jean-Marc Valind683c762012-12-21 16:17:38 -0500410 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500411 d_angle2 = angle2 - angle;
412 d2_angle2 = d_angle2 - d_angle;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500413
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500414 mod1 = d2_angle - (float)float2int(d2_angle);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500415 noisiness[i] = ABS16(mod1);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500416 mod1 *= mod1;
417 mod1 *= mod1;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800418
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500419 mod2 = d2_angle2 - (float)float2int(d2_angle2);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500420 noisiness[i] += ABS16(mod2);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500421 mod2 *= mod2;
422 mod2 *= mod2;
423
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500424 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
425 /* This introduces an extra delay of 2 frames in the detection. */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500426 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500427 /* No delay on this detection, but it's less reliable. */
428 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500429
430 A[i] = angle2;
431 dA[i] = d_angle2;
432 d2A[i] = mod2;
433 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500434 for (i=2;i<N2-1;i++)
435 {
436 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
Mark Harrisd6d70372017-02-20 19:51:40 -0800437 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500438 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500439 frame_tonality = 0;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500440 max_frame_tonality = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500441 /*tw_sum = 0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800442 info->activity = 0;
443 frame_noisiness = 0;
444 frame_stationarity = 0;
445 if (!tonal->count)
446 {
447 for (b=0;b<NB_TBANDS;b++)
448 {
449 tonal->lowE[b] = 1e10;
450 tonal->highE[b] = -1e10;
451 }
452 }
453 relativeE = 0;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500454 frame_loudness = 0;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500455 for (b=0;b<NB_TBANDS;b++)
456 {
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800457 float E=0, tE=0, nE=0;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500458 float L1, L2;
459 float stationarity;
460 for (i=tbands[b];i<tbands[b+1];i++)
461 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400462 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
463 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valinfc1b1f92013-09-06 16:32:50 -0400464#ifdef FIXED_POINT
465 /* FIXME: It's probably best to change the BFCC filter initial state instead */
466 binE *= 5.55e-17f;
467#endif
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500468 E += binE;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500469 tE += binE*MAX32(0, tonality[i]);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500470 nE += binE*2.f*(.5f-noisiness[i]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500471 }
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500472#ifndef FIXED_POINT
473 /* Check for extreme band energies that could cause NaNs later. */
474 if (!(E<1e9f) || celt_isnan(E))
475 {
476 info->valid = 0;
477 RESTORE_STACK;
478 return;
479 }
480#endif
481
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500482 tonal->E[tonal->E_count][b] = E;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500483 frame_noisiness += nE/(1e-15f+E);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800484
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500485 frame_loudness += (float)sqrt(E+1e-10f);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500486 logE[b] = (float)log(E+1e-10f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500487 tonal->logE[tonal->E_count][b] = logE[b];
488 if (tonal->count==0)
489 tonal->highE[b] = tonal->lowE[b] = logE[b];
490 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800491 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500492 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
Mark Harrisd6d70372017-02-20 19:51:40 -0800493 tonal->highE[b] -= .01f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500494 else
Mark Harrisd6d70372017-02-20 19:51:40 -0800495 tonal->lowE[b] += .01f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800496 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500497 if (logE[b] > tonal->highE[b])
498 {
499 tonal->highE[b] = logE[b];
500 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
501 } else if (logE[b] < tonal->lowE[b])
502 {
503 tonal->lowE[b] = logE[b];
504 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
505 }
506 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800507
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500508 L1=L2=0;
509 for (i=0;i<NB_FRAMES;i++)
510 {
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500511 L1 += (float)sqrt(tonal->E[i][b]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500512 L2 += tonal->E[i][b];
513 }
514
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500515 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500516 stationarity *= stationarity;
517 stationarity *= stationarity;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800518 frame_stationarity += stationarity;
519 /*band_tonality[b] = tE/(1e-15+E)*/;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500520 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500521#if 0
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800522 if (b>=NB_TONAL_SKIP_BANDS)
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500523 {
524 frame_tonality += tweight[b]*band_tonality[b];
525 tw_sum += tweight[b];
526 }
527#else
528 frame_tonality += band_tonality[b];
529 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
530 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
531#endif
Jean-Marc Valind683c762012-12-21 16:17:38 -0500532 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500533 slope += band_tonality[b]*(b-8);
Jean-Marc Valin70d90d12011-11-28 14:17:47 -0500534 /*printf("%f %f ", band_tonality[b], stationarity);*/
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500535 tonal->prev_band_tonality[b] = band_tonality[b];
536 }
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500537
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500538 for (i=0;i<NB_FRAMES;i++)
539 {
540 int j;
Mark Harrisd6d70372017-02-20 19:51:40 -0800541 float mindist = 1e15f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500542 for (j=0;j<NB_FRAMES;j++)
543 {
544 int k;
545 float dist=0;
546 for (k=0;k<NB_TBANDS;k++)
547 {
548 float tmp;
549 tmp = tonal->logE[i][k] - tonal->logE[j][k];
550 dist += tmp*tmp;
551 }
552 if (j!=i)
553 mindist = MIN32(mindist, dist);
554 }
555 spec_variability += mindist;
556 }
Mark Harrisd6d70372017-02-20 19:51:40 -0800557 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500558 bandwidth_mask = 0;
559 bandwidth = 0;
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400560 maxE = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500561 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400562#ifdef FIXED_POINT
563 noise_floor *= 1<<(15+SIG_SHIFT);
564#endif
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500565 noise_floor *= noise_floor;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500566 for (b=0;b<NB_TBANDS;b++)
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500567 {
568 float E=0;
569 int band_start, band_end;
570 /* Keep a margin of 300 Hz for aliasing */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500571 band_start = tbands[b];
572 band_end = tbands[b+1];
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500573 for (i=band_start;i<band_end;i++)
574 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400575 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
576 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500577 E += binE;
578 }
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500579 maxE = MAX32(maxE, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400580 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500581 E = MAX32(E, tonal->meanE[b]);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400582 /* Use a simple follower with 13 dB/Bark slope for spreading function */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500583 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400584 /* Consider the band "active" only if all these conditions are met:
585 1) less than 10 dB below the simple follower
586 2) less than 90 dB below the peak band (maximal masking possible considering
587 both the ATH and the loudness-dependent slope of the spreading function)
588 3) above the PCM quantization noise floor
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500589 We use b+1 because the first CELT band isn't included in tbands[]
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400590 */
Gregory Maxwell5280c712013-07-15 15:51:24 -0700591 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500592 bandwidth = b+1;
593 }
594 /* Special case for the last two bands, for which we don't have spectrum but only
595 the energy above 12 kHz. */
596 {
Mark Harrisd6d70372017-02-20 19:51:40 -0800597 float E = hp_ener*(1.f/(240*240));
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500598#ifdef FIXED_POINT
599 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
600 E *= ((opus_int32)1 << 2*SIG_SHIFT)*256.f;
601#endif
602 maxE = MAX32(maxE, E);
603 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
604 E = MAX32(E, tonal->meanE[b]);
605 /* Use a simple follower with 13 dB/Bark slope for spreading function */
606 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
607 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
608 bandwidth = 20;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500609 }
610 if (tonal->count<=2)
611 bandwidth = 20;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500612 frame_loudness = 20*(float)log10(frame_loudness);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500613 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500614 tonal->lowECount *= (1-alphaE);
615 if (frame_loudness < tonal->Etracker-30)
616 tonal->lowECount += alphaE;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800617
618 for (i=0;i<8;i++)
619 {
620 float sum=0;
621 for (b=0;b<16;b++)
622 sum += dct_table[i*16+b]*logE[b];
623 BFCC[i] = sum;
624 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500625 for (i=0;i<8;i++)
626 {
627 float sum=0;
628 for (b=0;b<16;b++)
Mark Harrisd6d70372017-02-20 19:51:40 -0800629 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500630 midE[i] = sum;
631 }
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800632
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800633 frame_stationarity /= NB_TBANDS;
634 relativeE /= NB_TBANDS;
635 if (tonal->count<10)
636 relativeE = .5;
637 frame_noisiness /= NB_TBANDS;
638#if 1
639 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
640#else
641 info->activity = .5*(1+frame_noisiness-frame_stationarity);
642#endif
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500643 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
Jean-Marc Valind683c762012-12-21 16:17:38 -0500644 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500645 tonal->prev_tonality = frame_tonality;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500646
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500647 slope /= 8*8;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800648 info->tonality_slope = slope;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500649
650 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500651 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800652 info->tonality = frame_tonality;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800653
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500654 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500655 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500656
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500657 for (i=0;i<4;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500658 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
659
660 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500661 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500662 for (i=0;i<3;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500663 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500664
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500665 if (tonal->count > 5)
666 {
667 for (i=0;i<9;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500668 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500669 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500670 for (i=0;i<4;i++)
671 features[i] = BFCC[i]-midE[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500672
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800673 for (i=0;i<8;i++)
674 {
675 tonal->mem[i+24] = tonal->mem[i+16];
676 tonal->mem[i+16] = tonal->mem[i+8];
677 tonal->mem[i+8] = tonal->mem[i];
678 tonal->mem[i] = BFCC[i];
679 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500680 for (i=0;i<9;i++)
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500681 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
Mark Harrisd6d70372017-02-20 19:51:40 -0800682 features[18] = spec_variability - 0.78f;
683 features[20] = info->tonality - 0.154723f;
684 features[21] = info->activity - 0.724643f;
685 features[22] = frame_stationarity - 0.743717f;
686 features[23] = info->tonality_slope + 0.069216f;
687 features[24] = tonal->lowECount - 0.067930f;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500688
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500689 mlp_process(&net, features, frame_probs);
690 frame_probs[0] = .5f*(frame_probs[0]+1);
Jean-Marc Valin317ffc22012-10-09 02:12:02 -0400691 /* Curve fitting between the MLP probability and the actual probability */
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500692 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400693 /* Probability of active audio (as opposed to silence) */
Jean-Marc Valine83a3652013-06-27 14:45:50 -0400694 frame_probs[1] = .5f*frame_probs[1]+.5f;
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500695 frame_probs[1] *= frame_probs[1];
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500696
Felicia Lim36481342016-05-16 15:29:53 +0200697 /* Probability of speech or music vs noise */
698 info->activity_probability = frame_probs[1];
699
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500700 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800701 {
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400702 /* Probability of state transition */
703 float tau;
704 /* Represents independence of the MLP probabilities, where
705 beta=1 means fully independent. */
706 float beta;
707 /* Denormalized probability of speech (p0) and music (p1) after update */
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800708 float p0, p1;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400709 /* Probabilities for "all speech" and "all music" */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400710 float s0, m0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400711 /* Probability sum for renormalisation */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400712 float psum;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400713 /* Instantaneous probability of speech and music, with beta pre-applied. */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400714 float speech0;
715 float music0;
Jean-Marc Valin9b1a27a2016-06-29 23:11:57 -0400716 float p, q;
Robert Meakins097fd4b2013-03-11 12:59:15 -0400717
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500718 /* More silence transitions for speech than for music. */
719 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
720 p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
721 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
722 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
723 /* p0 and p1 are the probabilities of speech and music at this frame
724 using only information from previous frame and applying the
725 state transition model */
726 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
727 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
728 /* We apply the current probability with exponent beta to work around
729 the fact that the probability estimates aren't independent. */
730 p0 *= (float)pow(1-frame_probs[1], beta);
731 p1 *= (float)pow(frame_probs[1], beta);
732 /* Normalise the probabilities to get the Marokv probability of music. */
733 tonal->vad_prob = p1/(p0+p1);
734 info->vad_prob = tonal->vad_prob;
735 /* Consider that silence has a 50-50 probability of being speech or music. */
736 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
737
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400738 /* One transition every 3 minutes of active audio */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500739 tau = .0001f;
Jean-Marc Valin9b1a27a2016-06-29 23:11:57 -0400740 /* Adapt beta based on how "unexpected" the new prob is */
741 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
742 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500743 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400744 /* p0 and p1 are the probabilities of speech and music at this frame
745 using only information from previous frame and applying the
746 state transition model */
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500747 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
748 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400749 /* We apply the current probability with exponent beta to work around
750 the fact that the probability estimates aren't independent. */
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500751 p0 *= (float)pow(1-frame_probs[0], beta);
752 p1 *= (float)pow(frame_probs[0], beta);
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400753 /* Normalise the probabilities to get the Marokv probability of music. */
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500754 tonal->music_prob = p1/(p0+p1);
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500755 info->music_prob = tonal->music_prob;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500756
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500757 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400758 /* This chunk of code deals with delayed decision. */
Jean-Marc Valin16ba19a2013-06-27 03:40:44 -0400759 psum=1e-20f;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400760 /* Instantaneous probability of speech and music, with beta pre-applied. */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400761 speech0 = (float)pow(1-frame_probs[0], beta);
762 music0 = (float)pow(frame_probs[0], beta);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500763 if (tonal->count==1)
764 {
765 tonal->pspeech[0]=.5;
766 tonal->pmusic [0]=.5;
767 }
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400768 /* Updated probability of having only speech (s0) or only music (m0),
769 before considering the new observation. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500770 s0 = tonal->pspeech[0] + tonal->pspeech[1];
771 m0 = tonal->pmusic [0] + tonal->pmusic [1];
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400772 /* Updates s0 and m0 with instantaneous probability. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500773 tonal->pspeech[0] = s0*(1-tau)*speech0;
774 tonal->pmusic [0] = m0*(1-tau)*music0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400775 /* Propagate the transition probabilities */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500776 for (i=1;i<DETECT_SIZE-1;i++)
777 {
778 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
779 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
780 }
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400781 /* Probability that the latest frame is speech, when all the previous ones were music. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500782 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400783 /* Probability that the latest frame is music, when all the previous ones were speech. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500784 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
785
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400786 /* Renormalise probabilities to 1 */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500787 for (i=0;i<DETECT_SIZE;i++)
788 psum += tonal->pspeech[i] + tonal->pmusic[i];
789 psum = 1.f/psum;
790 for (i=0;i<DETECT_SIZE;i++)
791 {
792 tonal->pspeech[i] *= psum;
793 tonal->pmusic [i] *= psum;
794 }
795 psum = tonal->pmusic[0];
796 for (i=1;i<DETECT_SIZE;i++)
797 psum += tonal->pspeech[i];
798
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500799 /* Estimate our confidence in the speech/music decisions */
Gregory Maxwell5280c712013-07-15 15:51:24 -0700800 if (frame_probs[1]>.75)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500801 {
Gregory Maxwell5280c712013-07-15 15:51:24 -0700802 if (tonal->music_prob>.9)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500803 {
804 float adapt;
805 adapt = 1.f/(++tonal->music_confidence_count);
806 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
807 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
808 }
Gregory Maxwell5280c712013-07-15 15:51:24 -0700809 if (tonal->music_prob<.1)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500810 {
811 float adapt;
812 adapt = 1.f/(++tonal->speech_confidence_count);
813 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
814 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
815 }
816 } else {
817 if (tonal->music_confidence_count==0)
Jean-Marc Valin16ba19a2013-06-27 03:40:44 -0400818 tonal->music_confidence = .9f;
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500819 if (tonal->speech_confidence_count==0)
Jean-Marc Valin16ba19a2013-06-27 03:40:44 -0400820 tonal->speech_confidence = .1f;
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500821 }
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800822 }
Jean-Marc Valind683c762012-12-21 16:17:38 -0500823 tonal->last_music = tonal->music_prob>.5f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500824#ifdef MLP_TRAINING
825 for (i=0;i<25;i++)
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800826 printf("%f ", features[i]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500827 printf("\n");
828#endif
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800829
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500830 info->bandwidth = bandwidth;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500831 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
Jean-Marc Valin2a9fdbc2011-12-07 17:39:40 -0500832 info->noisiness = frame_noisiness;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800833 info->valid = 1;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400834 RESTORE_STACK;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500835}
836
Jean-Marc Valina4c25122013-09-28 17:22:41 -0400837void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
Jean-Marc Valinb90e63b2013-09-16 13:08:52 -0400838 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
Ralph Gilesd43445f2015-12-30 10:00:17 -0800839 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500840{
841 int offset;
842 int pcm_len;
843
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500844 analysis_frame_size -= analysis_frame_size&1;
Jean-Marc Valin29254442013-09-28 19:29:23 -0400845 if (analysis_pcm != NULL)
846 {
847 /* Avoid overflow/wrap-around of the analysis buffer */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500848 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500849
Jean-Marc Valin29254442013-09-28 19:29:23 -0400850 pcm_len = analysis_frame_size - analysis->analysis_offset;
851 offset = analysis->analysis_offset;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400852 while (pcm_len>0) {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500853 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
854 offset += Fs/50;
855 pcm_len -= Fs/50;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400856 }
Jean-Marc Valin29254442013-09-28 19:29:23 -0400857 analysis->analysis_offset = analysis_frame_size;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500858
Jean-Marc Valin29254442013-09-28 19:29:23 -0400859 analysis->analysis_offset -= frame_size;
860 }
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500861
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500862 analysis_info->valid = 0;
863 tonality_get_info(analysis, analysis_info, frame_size);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500864}
Mark Harrisd6d70372017-02-20 19:51:40 -0800865
866#endif /* DISABLE_FLOAT_API */