blob: 4ad44d3557dfcca5931af0ac66ea6b6db4420285 [file] [log] [blame]
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -05001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Jean-Marc Valin2ff65562016-08-11 11:05:51 -040032#define ANALYSIS_C
33
34#include <stdio.h>
35
36#include "mathops.h"
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -050037#include "kiss_fft.h"
38#include "celt.h"
39#include "modes.h"
40#include "arch.h"
41#include "quant_bands.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040042#include "analysis.h"
43#include "mlp.h"
Jean-Marc Valina8783a12013-04-15 15:49:40 -040044#include "stack_alloc.h"
Jean-Marc Valincf9409f2016-12-16 17:52:15 -050045#include "float_cast.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040046
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080047#ifndef M_PI
48#define M_PI 3.141592653
49#endif
50
Mark Harrisd6d70372017-02-20 19:51:40 -080051#ifndef DISABLE_FLOAT_API
52
Jean-Marc Valin027ff072012-09-14 15:49:18 -040053static const float dct_table[128] = {
Jean-Marc Valind683c762012-12-21 16:17:38 -050054 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
63 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
64 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
65 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
66 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
67 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
68 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
69 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080070};
71
Jean-Marc Valin027ff072012-09-14 15:49:18 -040072static const float analysis_window[240] = {
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -040073 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
74 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
75 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
76 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
77 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
78 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
79 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
80 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
81 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
82 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
83 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
84 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
85 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
86 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
87 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
88 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
89 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
90 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
91 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
92 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
93 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
94 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
95 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
103};
104
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500105static const int tbands[NB_TBANDS+1] = {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500107};
108
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500109#define NB_TONAL_SKIP_BANDS 9
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800110
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500111static opus_val32 silk_resampler_down2_hp(
112 opus_val32 *S, /* I/O State vector [ 2 ] */
113 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
114 const opus_val32 *in, /* I Input signal [ len ] */
115 int inLen /* I Number of input samples */
116)
117{
118 int k, len2 = inLen/2;
119 opus_val32 in32, out32, out32_hp, Y, X;
120 opus_val64 hp_ener = 0;
121 /* Internal variables and state are in Q10 format */
122 for( k = 0; k < len2; k++ ) {
123 /* Convert to Q10 */
124 in32 = in[ 2 * k ];
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -0400125
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500126 /* All-pass section for even input sample */
127 Y = SUB32( in32, S[ 0 ] );
128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
129 out32 = ADD32( S[ 0 ], X );
130 S[ 0 ] = ADD32( in32, X );
131 out32_hp = out32;
132 /* Convert to Q10 */
133 in32 = in[ 2 * k + 1 ];
134
135 /* All-pass section for odd input sample, and add to output of previous section */
136 Y = SUB32( in32, S[ 1 ] );
137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
138 out32 = ADD32( out32, S[ 1 ] );
139 out32 = ADD32( out32, X );
140 S[ 1 ] = ADD32( in32, X );
141
142 Y = SUB32( -in32, S[ 2 ] );
143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
144 out32_hp = ADD32( out32_hp, S[ 2 ] );
145 out32_hp = ADD32( out32_hp, X );
146 S[ 2 ] = ADD32( -in32, X );
147
148 hp_ener += out32_hp*(opus_val64)out32_hp;
149 /* Add, convert back to int16 and store to output */
150 out[ k ] = HALF32(out32);
151 }
152#ifdef FIXED_POINT
153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
155#endif
Mark Harrisd6d70372017-02-20 19:51:40 -0800156 return (opus_val32)hp_ener;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500157}
158
159static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
160{
161 VARDECL(opus_val32, tmp);
162 opus_val32 scale;
163 int j;
164 opus_val32 ret = 0;
165 SAVE_STACK;
166
167 if (subframe==0) return 0;
168 if (Fs == 48000)
169 {
170 subframe *= 2;
171 offset *= 2;
172 } else if (Fs == 16000) {
173 subframe = subframe*2/3;
174 offset = offset*2/3;
175 }
176 ALLOC(tmp, subframe, opus_val32);
177
178 downmix(_x, tmp, subframe, offset, c1, c2, C);
179#ifdef FIXED_POINT
180 scale = (1<<SIG_SHIFT);
181#else
182 scale = 1.f/32768;
183#endif
184 if (c2==-2)
185 scale /= C;
186 else if (c2>-1)
187 scale /= 2;
188 for (j=0;j<subframe;j++)
189 tmp[j] *= scale;
190 if (Fs == 48000)
191 {
192 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193 } else if (Fs == 24000) {
194 OPUS_COPY(y, tmp, subframe);
195 } else if (Fs == 16000) {
196 VARDECL(opus_val32, tmp3x);
197 ALLOC(tmp3x, 3*subframe, opus_val32);
198 /* Don't do this at home! This resampler is horrible and it's only (barely)
199 usable for the purpose of the analysis because we don't care about all
200 the aliasing between 8 kHz and 12 kHz. */
201 for (j=0;j<subframe;j++)
202 {
203 tmp3x[3*j] = tmp[j];
204 tmp3x[3*j+1] = tmp[j];
205 tmp3x[3*j+2] = tmp[j];
206 }
207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208 }
209 RESTORE_STACK;
210 return ret;
211}
212
213void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
Ralph Gilesd43445f2015-12-30 10:00:17 -0800214{
215 /* Initialize reusable fields. */
216 tonal->arch = opus_select_arch();
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500217 tonal->Fs = Fs;
Ralph Giles9e3872a2015-12-30 15:27:02 -0800218 /* Clear remaining fields. */
219 tonality_analysis_reset(tonal);
220}
221
222void tonality_analysis_reset(TonalityAnalysisState *tonal)
223{
224 /* Clear non-reusable fields. */
225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
Jean-Marc Valin15a30e22017-06-04 02:59:33 -0400227 tonal->music_confidence = .9f;
228 tonal->speech_confidence = .1f;
Ralph Gilesd43445f2015-12-30 10:00:17 -0800229}
230
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500231void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232{
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500233 int pos;
234 int curr_lookahead;
235 float psum;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400236 float tonality_max;
237 float tonality_avg;
238 int tonality_count;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500239 int i;
240
241 pos = tonal->read_pos;
242 curr_lookahead = tonal->write_pos-tonal->read_pos;
243 if (curr_lookahead<0)
244 curr_lookahead += DETECT_SIZE;
245
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500246 /* On long frames, look at the second analysis window rather than the first. */
247 if (len > tonal->Fs/50 && pos != tonal->write_pos)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500248 {
249 pos++;
250 if (pos==DETECT_SIZE)
251 pos=0;
252 }
253 if (pos == tonal->write_pos)
254 pos--;
255 if (pos<0)
256 pos = DETECT_SIZE-1;
257 OPUS_COPY(info_out, &tonal->info[pos], 1);
Jean-Marc Valin41443422017-06-09 22:16:53 -0400258 tonality_max = tonality_avg = info_out->tonality;
259 tonality_count = 1;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500260 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
261 for (i=0;i<3;i++)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500262 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500263 pos++;
264 if (pos==DETECT_SIZE)
265 pos = 0;
266 if (pos == tonal->write_pos)
267 break;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
269 tonality_avg += tonal->info[pos].tonality;
270 tonality_count++;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500271 }
Jean-Marc Valin41443422017-06-09 22:16:53 -0400272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500273 tonal->read_subframe += len/(tonal->Fs/400);
274 while (tonal->read_subframe>=8)
275 {
276 tonal->read_subframe -= 8;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500277 tonal->read_pos++;
278 }
279 if (tonal->read_pos>=DETECT_SIZE)
280 tonal->read_pos-=DETECT_SIZE;
281
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500282 /* The -1 is to compensate for the delay in the features themselves. */
283 curr_lookahead = IMAX(curr_lookahead-1, 0);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500284
285 psum=0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400286 /* Summing the probability of transition patterns that involve music at
287 time (DETECT_SIZE-curr_lookahead-1) */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
289 psum += tonal->pmusic[i];
290 for (;i<DETECT_SIZE;i++)
291 psum += tonal->pspeech[i];
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500294
295 info_out->music_prob = psum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500296}
297
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500298static const float std_feature_bias[9] = {
Mark Harrisd6d70372017-02-20 19:51:40 -0800299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
300 2.163313f, 1.260756f, 1.116868f, 1.918795f
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500301};
302
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400303#define LEAKAGE_OFFSET 2.5f
304#define LEAKAGE_SLOPE 2.f
305
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400306#ifdef FIXED_POINT
307/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
308 compensate for that in the energy. */
309#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
310#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
311#else
312#define SCALE_ENER(e) (e)
313#endif
314
Ralph Gilesd43445f2015-12-30 10:00:17 -0800315static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500316{
317 int i, b;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500318 const kiss_fft_state *kfft;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400319 VARDECL(kiss_fft_cpx, in);
320 VARDECL(kiss_fft_cpx, out);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500321 int N = 480, N2=240;
Joshua Bowman3b60e812012-10-23 12:18:38 -0700322 float * OPUS_RESTRICT A = tonal->angle;
323 float * OPUS_RESTRICT dA = tonal->d_angle;
324 float * OPUS_RESTRICT d2A = tonal->d2_angle;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400325 VARDECL(float, tonality);
326 VARDECL(float, noisiness);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500327 float band_tonality[NB_TBANDS];
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800328 float logE[NB_TBANDS];
329 float BFCC[8];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400330 float features[25];
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500331 float frame_tonality;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500332 float max_frame_tonality;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500333 /*float tw_sum=0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800334 float frame_noisiness;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500335 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500336 float slope=0;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800337 float frame_stationarity;
338 float relativeE;
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500339 float frame_probs[2];
Jean-Marc Valina2054572011-11-25 23:07:46 -0500340 float alpha, alphaE, alphaE2;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500341 float frame_loudness;
Jean-Marc Valina2054572011-11-25 23:07:46 -0500342 float bandwidth_mask;
343 int bandwidth=0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500344 float maxE = 0;
345 float noise_floor;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500346 int remaining;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500347 AnalysisInfo *info;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500348 float hp_ener;
349 float tonality2[240];
350 float midE[8];
351 float spec_variability=0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400352 float band_log2[NB_TBANDS+1];
353 float leakage_from[NB_TBANDS+1];
354 float leakage_to[NB_TBANDS+1];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400355 SAVE_STACK;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500356
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500357 alpha = 1.f/IMIN(10, 1+tonal->count);
358 alphaE = 1.f/IMIN(25, 1+tonal->count);
359 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
360
361 if (tonal->Fs == 48000)
362 {
363 /* len and offset are now at 24 kHz. */
364 len/= 2;
365 offset /= 2;
366 } else if (tonal->Fs == 16000) {
367 len = 3*len/2;
368 offset = 3*offset/2;
369 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500370
Jean-Marc Valin1d7dea12017-06-04 17:45:06 -0400371 if (tonal->count<4) {
372 if (tonal->application == OPUS_APPLICATION_VOIP)
373 tonal->music_prob = .1;
374 else
375 tonal->music_prob = .625;
376 }
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500377 kfft = celt_mode->mdct.kfft[0];
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500378 if (tonal->count==0)
379 tonal->mem_fill = 240;
Mark Harrisd6d70372017-02-20 19:51:40 -0800380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
384 {
385 tonal->mem_fill += len;
386 /* Don't have enough to update the analysis */
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400387 RESTORE_STACK;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500388 return;
389 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500390 hp_ener = tonal->hp_ener_accum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500391 info = &tonal->info[tonal->write_pos++];
392 if (tonal->write_pos>=DETECT_SIZE)
393 tonal->write_pos-=DETECT_SIZE;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500394
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400395 ALLOC(in, 480, kiss_fft_cpx);
396 ALLOC(out, 480, kiss_fft_cpx);
397 ALLOC(tonality, 240, float);
398 ALLOC(noisiness, 240, float);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500399 for (i=0;i<N2;i++)
400 {
401 float w = analysis_window[i];
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500406 }
407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
Mark Harrisd6d70372017-02-20 19:51:40 -0800409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
410 &tonal->inmem[240], tonal->downmix_state, remaining,
411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500412 tonal->mem_fill = 240 + remaining;
Ralph Gilesd43445f2015-12-30 10:00:17 -0800413 opus_fft(kfft, in, out, tonal->arch);
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500414#ifndef FIXED_POINT
415 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
416 if (celt_isnan(out[0].r))
417 {
418 info->valid = 0;
419 RESTORE_STACK;
420 return;
421 }
422#endif
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500423
424 for (i=1;i<N2;i++)
425 {
426 float X1r, X2r, X1i, X2i;
427 float angle, d_angle, d2_angle;
428 float angle2, d_angle2, d2_angle2;
429 float mod1, mod2, avg_mod;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500430 X1r = (float)out[i].r+out[N-i].r;
431 X1i = (float)out[i].i-out[N-i].i;
432 X2r = (float)out[i].i+out[N-i].i;
433 X2i = (float)out[N-i].r-out[i].r;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800434
Jean-Marc Valind683c762012-12-21 16:17:38 -0500435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500436 d_angle = angle - A[i];
437 d2_angle = d_angle - dA[i];
438
Jean-Marc Valind683c762012-12-21 16:17:38 -0500439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500440 d_angle2 = angle2 - angle;
441 d2_angle2 = d_angle2 - d_angle;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500442
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500443 mod1 = d2_angle - (float)float2int(d2_angle);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500444 noisiness[i] = ABS16(mod1);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500445 mod1 *= mod1;
446 mod1 *= mod1;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800447
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500448 mod2 = d2_angle2 - (float)float2int(d2_angle2);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500449 noisiness[i] += ABS16(mod2);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500450 mod2 *= mod2;
451 mod2 *= mod2;
452
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500453 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
454 /* This introduces an extra delay of 2 frames in the detection. */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500456 /* No delay on this detection, but it's less reliable. */
457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500458
459 A[i] = angle2;
460 dA[i] = d_angle2;
461 d2A[i] = mod2;
462 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500463 for (i=2;i<N2-1;i++)
464 {
465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
Mark Harrisd6d70372017-02-20 19:51:40 -0800466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500467 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500468 frame_tonality = 0;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500469 max_frame_tonality = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500470 /*tw_sum = 0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800471 info->activity = 0;
472 frame_noisiness = 0;
473 frame_stationarity = 0;
474 if (!tonal->count)
475 {
476 for (b=0;b<NB_TBANDS;b++)
477 {
478 tonal->lowE[b] = 1e10;
479 tonal->highE[b] = -1e10;
480 }
481 }
482 relativeE = 0;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500483 frame_loudness = 0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400484 /* The energy of the very first band is special because of DC. */
485 {
486 float E = 0;
487 float X1r, X2r;
488 X1r = 2*(float)out[0].r;
489 X2r = 2*(float)out[0].i;
490 E = X1r*X1r + X2r*X2r;
491 for (i=1;i<4;i++)
492 {
493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
495 E += binE;
496 }
497 E = SCALE_ENER(E);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400499 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500500 for (b=0;b<NB_TBANDS;b++)
501 {
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800502 float E=0, tE=0, nE=0;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500503 float L1, L2;
504 float stationarity;
505 for (i=tbands[b];i<tbands[b+1];i++)
506 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400509 binE = SCALE_ENER(binE);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500510 E += binE;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500511 tE += binE*MAX32(0, tonality[i]);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500512 nE += binE*2.f*(.5f-noisiness[i]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500513 }
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500514#ifndef FIXED_POINT
515 /* Check for extreme band energies that could cause NaNs later. */
516 if (!(E<1e9f) || celt_isnan(E))
517 {
518 info->valid = 0;
519 RESTORE_STACK;
520 return;
521 }
522#endif
523
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500524 tonal->E[tonal->E_count][b] = E;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500525 frame_noisiness += nE/(1e-15f+E);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800526
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500527 frame_loudness += (float)sqrt(E+1e-10f);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500528 logE[b] = (float)log(E+1e-10f);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500530 tonal->logE[tonal->E_count][b] = logE[b];
531 if (tonal->count==0)
532 tonal->highE[b] = tonal->lowE[b] = logE[b];
533 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800534 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
Mark Harrisd6d70372017-02-20 19:51:40 -0800536 tonal->highE[b] -= .01f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500537 else
Mark Harrisd6d70372017-02-20 19:51:40 -0800538 tonal->lowE[b] += .01f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800539 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500540 if (logE[b] > tonal->highE[b])
541 {
542 tonal->highE[b] = logE[b];
543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
544 } else if (logE[b] < tonal->lowE[b])
545 {
546 tonal->lowE[b] = logE[b];
547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
548 }
549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800550
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500551 L1=L2=0;
552 for (i=0;i<NB_FRAMES;i++)
553 {
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500554 L1 += (float)sqrt(tonal->E[i][b]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500555 L2 += tonal->E[i][b];
556 }
557
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500559 stationarity *= stationarity;
560 stationarity *= stationarity;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800561 frame_stationarity += stationarity;
562 /*band_tonality[b] = tE/(1e-15+E)*/;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500563 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500564#if 0
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800565 if (b>=NB_TONAL_SKIP_BANDS)
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500566 {
567 frame_tonality += tweight[b]*band_tonality[b];
568 tw_sum += tweight[b];
569 }
570#else
571 frame_tonality += band_tonality[b];
572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
574#endif
Jean-Marc Valind683c762012-12-21 16:17:38 -0500575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500576 slope += band_tonality[b]*(b-8);
Jean-Marc Valin70d90d12011-11-28 14:17:47 -0500577 /*printf("%f %f ", band_tonality[b], stationarity);*/
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500578 tonal->prev_band_tonality[b] = band_tonality[b];
579 }
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500580
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400581 leakage_from[0] = band_log2[0];
582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
583 for (b=1;b<NB_TBANDS+1;b++)
584 {
585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
588 }
589 for (b=NB_TBANDS-2;b>=0;b--)
590 {
591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
594 }
595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
596 for (b=0;b<NB_TBANDS+1;b++)
597 {
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400599 represents the boost needed to overcome the amount of analysis leakage
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400600 cause in a weaker band b by louder neighbouring bands.
601 The second, based on leakage_from[], applies to a loud band b for
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400602 which the quantization noise causes synthesis leakage to the weaker
603 neighbouring bands. */
604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
607 }
608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
609
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500610 for (i=0;i<NB_FRAMES;i++)
611 {
612 int j;
Mark Harrisd6d70372017-02-20 19:51:40 -0800613 float mindist = 1e15f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500614 for (j=0;j<NB_FRAMES;j++)
615 {
616 int k;
617 float dist=0;
618 for (k=0;k<NB_TBANDS;k++)
619 {
620 float tmp;
621 tmp = tonal->logE[i][k] - tonal->logE[j][k];
622 dist += tmp*tmp;
623 }
624 if (j!=i)
625 mindist = MIN32(mindist, dist);
626 }
627 spec_variability += mindist;
628 }
Mark Harrisd6d70372017-02-20 19:51:40 -0800629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500630 bandwidth_mask = 0;
631 bandwidth = 0;
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400632 maxE = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
634 noise_floor *= noise_floor;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500635 for (b=0;b<NB_TBANDS;b++)
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500636 {
637 float E=0;
638 int band_start, band_end;
639 /* Keep a margin of 300 Hz for aliasing */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500640 band_start = tbands[b];
641 band_end = tbands[b+1];
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500642 for (i=band_start;i<band_end;i++)
643 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500646 E += binE;
647 }
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400648 E = SCALE_ENER(E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500649 maxE = MAX32(maxE, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500651 E = MAX32(E, tonal->meanE[b]);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400652 /* Use a simple follower with 13 dB/Bark slope for spreading function */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400654 /* Consider the band "active" only if all these conditions are met:
655 1) less than 10 dB below the simple follower
656 2) less than 90 dB below the peak band (maximal masking possible considering
657 both the ATH and the loudness-dependent slope of the spreading function)
658 3) above the PCM quantization noise floor
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500659 We use b+1 because the first CELT band isn't included in tbands[]
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400660 */
Gregory Maxwell5280c712013-07-15 15:51:24 -0700661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500662 bandwidth = b+1;
663 }
664 /* Special case for the last two bands, for which we don't have spectrum but only
665 the energy above 12 kHz. */
666 {
Mark Harrisd6d70372017-02-20 19:51:40 -0800667 float E = hp_ener*(1.f/(240*240));
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500668#ifdef FIXED_POINT
669 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400670 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500671#endif
672 maxE = MAX32(maxE, E);
673 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
674 E = MAX32(E, tonal->meanE[b]);
675 /* Use a simple follower with 13 dB/Bark slope for spreading function */
676 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
677 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
678 bandwidth = 20;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500679 }
680 if (tonal->count<=2)
681 bandwidth = 20;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500682 frame_loudness = 20*(float)log10(frame_loudness);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500683 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500684 tonal->lowECount *= (1-alphaE);
685 if (frame_loudness < tonal->Etracker-30)
686 tonal->lowECount += alphaE;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800687
688 for (i=0;i<8;i++)
689 {
690 float sum=0;
691 for (b=0;b<16;b++)
692 sum += dct_table[i*16+b]*logE[b];
693 BFCC[i] = sum;
694 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500695 for (i=0;i<8;i++)
696 {
697 float sum=0;
698 for (b=0;b<16;b++)
Mark Harrisd6d70372017-02-20 19:51:40 -0800699 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500700 midE[i] = sum;
701 }
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800702
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800703 frame_stationarity /= NB_TBANDS;
704 relativeE /= NB_TBANDS;
705 if (tonal->count<10)
706 relativeE = .5;
707 frame_noisiness /= NB_TBANDS;
708#if 1
709 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
710#else
711 info->activity = .5*(1+frame_noisiness-frame_stationarity);
712#endif
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500713 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
Jean-Marc Valind683c762012-12-21 16:17:38 -0500714 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500715 tonal->prev_tonality = frame_tonality;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500716
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500717 slope /= 8*8;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800718 info->tonality_slope = slope;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500719
720 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500721 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800722 info->tonality = frame_tonality;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800723
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500724 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500725 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500726
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500727 for (i=0;i<4;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500728 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
729
730 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500731 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500732 for (i=0;i<3;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500733 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500734
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500735 if (tonal->count > 5)
736 {
737 for (i=0;i<9;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500738 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500739 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500740 for (i=0;i<4;i++)
741 features[i] = BFCC[i]-midE[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500742
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800743 for (i=0;i<8;i++)
744 {
745 tonal->mem[i+24] = tonal->mem[i+16];
746 tonal->mem[i+16] = tonal->mem[i+8];
747 tonal->mem[i+8] = tonal->mem[i];
748 tonal->mem[i] = BFCC[i];
749 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500750 for (i=0;i<9;i++)
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500751 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
Mark Harrisd6d70372017-02-20 19:51:40 -0800752 features[18] = spec_variability - 0.78f;
753 features[20] = info->tonality - 0.154723f;
754 features[21] = info->activity - 0.724643f;
755 features[22] = frame_stationarity - 0.743717f;
756 features[23] = info->tonality_slope + 0.069216f;
757 features[24] = tonal->lowECount - 0.067930f;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500758
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500759 mlp_process(&net, features, frame_probs);
760 frame_probs[0] = .5f*(frame_probs[0]+1);
Jean-Marc Valin317ffc22012-10-09 02:12:02 -0400761 /* Curve fitting between the MLP probability and the actual probability */
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500762 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400763 /* Probability of active audio (as opposed to silence) */
Jean-Marc Valine83a3652013-06-27 14:45:50 -0400764 frame_probs[1] = .5f*frame_probs[1]+.5f;
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500765 frame_probs[1] *= frame_probs[1];
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500766
Felicia Lim36481342016-05-16 15:29:53 +0200767 /* Probability of speech or music vs noise */
768 info->activity_probability = frame_probs[1];
769
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500770 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800771 {
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400772 /* Probability of state transition */
773 float tau;
774 /* Represents independence of the MLP probabilities, where
775 beta=1 means fully independent. */
776 float beta;
777 /* Denormalized probability of speech (p0) and music (p1) after update */
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800778 float p0, p1;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400779 /* Probabilities for "all speech" and "all music" */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400780 float s0, m0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400781 /* Probability sum for renormalisation */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400782 float psum;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400783 /* Instantaneous probability of speech and music, with beta pre-applied. */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400784 float speech0;
785 float music0;
Jean-Marc Valin9b1a27a2016-06-29 23:11:57 -0400786 float p, q;
Robert Meakins097fd4b2013-03-11 12:59:15 -0400787
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500788 /* More silence transitions for speech than for music. */
789 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
790 p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
791 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
792 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
793 /* p0 and p1 are the probabilities of speech and music at this frame
794 using only information from previous frame and applying the
795 state transition model */
796 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
797 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
798 /* We apply the current probability with exponent beta to work around
799 the fact that the probability estimates aren't independent. */
800 p0 *= (float)pow(1-frame_probs[1], beta);
801 p1 *= (float)pow(frame_probs[1], beta);
802 /* Normalise the probabilities to get the Marokv probability of music. */
803 tonal->vad_prob = p1/(p0+p1);
804 info->vad_prob = tonal->vad_prob;
805 /* Consider that silence has a 50-50 probability of being speech or music. */
806 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
807
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400808 /* One transition every 3 minutes of active audio */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500809 tau = .0001f;
Jean-Marc Valin9b1a27a2016-06-29 23:11:57 -0400810 /* Adapt beta based on how "unexpected" the new prob is */
811 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
812 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500813 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400814 /* p0 and p1 are the probabilities of speech and music at this frame
815 using only information from previous frame and applying the
816 state transition model */
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500817 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
818 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400819 /* We apply the current probability with exponent beta to work around
820 the fact that the probability estimates aren't independent. */
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500821 p0 *= (float)pow(1-frame_probs[0], beta);
822 p1 *= (float)pow(frame_probs[0], beta);
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400823 /* Normalise the probabilities to get the Marokv probability of music. */
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500824 tonal->music_prob = p1/(p0+p1);
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500825 info->music_prob = tonal->music_prob;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500826
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500827 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
Jean-Marc Valin0f5ff802013-07-01 18:39:34 -0400828 /* This chunk of code deals with delayed decision. */
Jean-Marc Valin16ba19a2013-06-27 03:40:44 -0400829 psum=1e-20f;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400830 /* Instantaneous probability of speech and music, with beta pre-applied. */
Robert Meakins097fd4b2013-03-11 12:59:15 -0400831 speech0 = (float)pow(1-frame_probs[0], beta);
832 music0 = (float)pow(frame_probs[0], beta);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500833 if (tonal->count==1)
834 {
Jean-Marc Valin1d7dea12017-06-04 17:45:06 -0400835 if (tonal->application == OPUS_APPLICATION_VOIP)
836 tonal->pmusic[0] = .1;
837 else
838 tonal->pmusic[0] = .625;
839 tonal->pspeech[0] = 1-tonal->pmusic[0];
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500840 }
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400841 /* Updated probability of having only speech (s0) or only music (m0),
842 before considering the new observation. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500843 s0 = tonal->pspeech[0] + tonal->pspeech[1];
844 m0 = tonal->pmusic [0] + tonal->pmusic [1];
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400845 /* Updates s0 and m0 with instantaneous probability. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500846 tonal->pspeech[0] = s0*(1-tau)*speech0;
847 tonal->pmusic [0] = m0*(1-tau)*music0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400848 /* Propagate the transition probabilities */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500849 for (i=1;i<DETECT_SIZE-1;i++)
850 {
851 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
852 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
853 }
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400854 /* Probability that the latest frame is speech, when all the previous ones were music. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500855 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400856 /* Probability that the latest frame is music, when all the previous ones were speech. */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500857 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
858
Jean-Marc Valin4eb399d2013-07-01 20:19:24 -0400859 /* Renormalise probabilities to 1 */
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500860 for (i=0;i<DETECT_SIZE;i++)
861 psum += tonal->pspeech[i] + tonal->pmusic[i];
862 psum = 1.f/psum;
863 for (i=0;i<DETECT_SIZE;i++)
864 {
865 tonal->pspeech[i] *= psum;
866 tonal->pmusic [i] *= psum;
867 }
868 psum = tonal->pmusic[0];
869 for (i=1;i<DETECT_SIZE;i++)
870 psum += tonal->pspeech[i];
871
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500872 /* Estimate our confidence in the speech/music decisions */
Gregory Maxwell5280c712013-07-15 15:51:24 -0700873 if (frame_probs[1]>.75)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500874 {
Gregory Maxwell5280c712013-07-15 15:51:24 -0700875 if (tonal->music_prob>.9)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500876 {
877 float adapt;
878 adapt = 1.f/(++tonal->music_confidence_count);
879 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
880 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
881 }
Gregory Maxwell5280c712013-07-15 15:51:24 -0700882 if (tonal->music_prob<.1)
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500883 {
884 float adapt;
885 adapt = 1.f/(++tonal->speech_confidence_count);
886 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
887 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
888 }
Jean-Marc Valin73142b12013-02-28 15:30:51 -0500889 }
Jean-Marc Valin9987a3b2011-11-17 19:21:07 +0800890 }
Jean-Marc Valind683c762012-12-21 16:17:38 -0500891 tonal->last_music = tonal->music_prob>.5f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500892#ifdef MLP_TRAINING
893 for (i=0;i<25;i++)
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800894 printf("%f ", features[i]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500895 printf("\n");
896#endif
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800897
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500898 info->bandwidth = bandwidth;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500899 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
Jean-Marc Valin2a9fdbc2011-12-07 17:39:40 -0500900 info->noisiness = frame_noisiness;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800901 info->valid = 1;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400902 RESTORE_STACK;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500903}
904
Jean-Marc Valina4c25122013-09-28 17:22:41 -0400905void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
Jean-Marc Valinb90e63b2013-09-16 13:08:52 -0400906 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
Ralph Gilesd43445f2015-12-30 10:00:17 -0800907 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500908{
909 int offset;
910 int pcm_len;
911
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500912 analysis_frame_size -= analysis_frame_size&1;
Jean-Marc Valin29254442013-09-28 19:29:23 -0400913 if (analysis_pcm != NULL)
914 {
915 /* Avoid overflow/wrap-around of the analysis buffer */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500916 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500917
Jean-Marc Valin29254442013-09-28 19:29:23 -0400918 pcm_len = analysis_frame_size - analysis->analysis_offset;
919 offset = analysis->analysis_offset;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400920 while (pcm_len>0) {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500921 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
922 offset += Fs/50;
923 pcm_len -= Fs/50;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400924 }
Jean-Marc Valin29254442013-09-28 19:29:23 -0400925 analysis->analysis_offset = analysis_frame_size;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500926
Jean-Marc Valin29254442013-09-28 19:29:23 -0400927 analysis->analysis_offset -= frame_size;
928 }
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500929
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500930 analysis_info->valid = 0;
931 tonality_get_info(analysis, analysis_info, frame_size);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500932}
Mark Harrisd6d70372017-02-20 19:51:40 -0800933
934#endif /* DISABLE_FLOAT_API */