blob: 11f871263b2b63c7ede4d6be7b80c6b8fe030399 [file] [log] [blame]
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -05001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Jean-Marc Valin2ff65562016-08-11 11:05:51 -040032#define ANALYSIS_C
33
34#include <stdio.h>
35
36#include "mathops.h"
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -050037#include "kiss_fft.h"
38#include "celt.h"
39#include "modes.h"
40#include "arch.h"
41#include "quant_bands.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040042#include "analysis.h"
43#include "mlp.h"
Jean-Marc Valina8783a12013-04-15 15:49:40 -040044#include "stack_alloc.h"
Jean-Marc Valincf9409f2016-12-16 17:52:15 -050045#include "float_cast.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040046
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080047#ifndef M_PI
48#define M_PI 3.141592653
49#endif
50
Mark Harrisd6d70372017-02-20 19:51:40 -080051#ifndef DISABLE_FLOAT_API
52
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -040053#define TRANSITION_PENALTY 10
54
Jean-Marc Valin027ff072012-09-14 15:49:18 -040055static const float dct_table[128] = {
Jean-Marc Valind683c762012-12-21 16:17:38 -050056 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
57 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
58 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
59 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
60 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
61 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
62 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
63 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
64 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
65 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
66 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
67 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
68 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
69 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
70 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
71 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080072};
73
Jean-Marc Valin027ff072012-09-14 15:49:18 -040074static const float analysis_window[240] = {
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -040075 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
76 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
77 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
78 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
79 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
80 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
81 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
82 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
83 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
84 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
85 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
86 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
87 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
88 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
89 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
90 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
91 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
92 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
93 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
94 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
95 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
96 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
97 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
98 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
99 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
100 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
101 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
102 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
103 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
104 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
105};
106
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500107static const int tbands[NB_TBANDS+1] = {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500108 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500109};
110
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500111#define NB_TONAL_SKIP_BANDS 9
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800112
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500113static opus_val32 silk_resampler_down2_hp(
114 opus_val32 *S, /* I/O State vector [ 2 ] */
115 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
116 const opus_val32 *in, /* I Input signal [ len ] */
117 int inLen /* I Number of input samples */
118)
119{
120 int k, len2 = inLen/2;
121 opus_val32 in32, out32, out32_hp, Y, X;
122 opus_val64 hp_ener = 0;
123 /* Internal variables and state are in Q10 format */
124 for( k = 0; k < len2; k++ ) {
125 /* Convert to Q10 */
126 in32 = in[ 2 * k ];
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -0400127
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500128 /* All-pass section for even input sample */
129 Y = SUB32( in32, S[ 0 ] );
130 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
131 out32 = ADD32( S[ 0 ], X );
132 S[ 0 ] = ADD32( in32, X );
133 out32_hp = out32;
134 /* Convert to Q10 */
135 in32 = in[ 2 * k + 1 ];
136
137 /* All-pass section for odd input sample, and add to output of previous section */
138 Y = SUB32( in32, S[ 1 ] );
139 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
140 out32 = ADD32( out32, S[ 1 ] );
141 out32 = ADD32( out32, X );
142 S[ 1 ] = ADD32( in32, X );
143
144 Y = SUB32( -in32, S[ 2 ] );
145 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
146 out32_hp = ADD32( out32_hp, S[ 2 ] );
147 out32_hp = ADD32( out32_hp, X );
148 S[ 2 ] = ADD32( -in32, X );
149
150 hp_ener += out32_hp*(opus_val64)out32_hp;
151 /* Add, convert back to int16 and store to output */
152 out[ k ] = HALF32(out32);
153 }
154#ifdef FIXED_POINT
155 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
156 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
157#endif
Mark Harrisd6d70372017-02-20 19:51:40 -0800158 return (opus_val32)hp_ener;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500159}
160
161static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
162{
163 VARDECL(opus_val32, tmp);
164 opus_val32 scale;
165 int j;
166 opus_val32 ret = 0;
167 SAVE_STACK;
168
169 if (subframe==0) return 0;
170 if (Fs == 48000)
171 {
172 subframe *= 2;
173 offset *= 2;
174 } else if (Fs == 16000) {
175 subframe = subframe*2/3;
176 offset = offset*2/3;
177 }
178 ALLOC(tmp, subframe, opus_val32);
179
180 downmix(_x, tmp, subframe, offset, c1, c2, C);
181#ifdef FIXED_POINT
182 scale = (1<<SIG_SHIFT);
183#else
184 scale = 1.f/32768;
185#endif
186 if (c2==-2)
187 scale /= C;
188 else if (c2>-1)
189 scale /= 2;
190 for (j=0;j<subframe;j++)
191 tmp[j] *= scale;
192 if (Fs == 48000)
193 {
194 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
195 } else if (Fs == 24000) {
196 OPUS_COPY(y, tmp, subframe);
197 } else if (Fs == 16000) {
198 VARDECL(opus_val32, tmp3x);
199 ALLOC(tmp3x, 3*subframe, opus_val32);
200 /* Don't do this at home! This resampler is horrible and it's only (barely)
201 usable for the purpose of the analysis because we don't care about all
202 the aliasing between 8 kHz and 12 kHz. */
203 for (j=0;j<subframe;j++)
204 {
205 tmp3x[3*j] = tmp[j];
206 tmp3x[3*j+1] = tmp[j];
207 tmp3x[3*j+2] = tmp[j];
208 }
209 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
210 }
211 RESTORE_STACK;
212 return ret;
213}
214
215void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
Ralph Gilesd43445f2015-12-30 10:00:17 -0800216{
217 /* Initialize reusable fields. */
218 tonal->arch = opus_select_arch();
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500219 tonal->Fs = Fs;
Ralph Giles9e3872a2015-12-30 15:27:02 -0800220 /* Clear remaining fields. */
221 tonality_analysis_reset(tonal);
222}
223
224void tonality_analysis_reset(TonalityAnalysisState *tonal)
225{
226 /* Clear non-reusable fields. */
227 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
228 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
Ralph Gilesd43445f2015-12-30 10:00:17 -0800229}
230
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500231void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232{
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500233 int pos;
234 int curr_lookahead;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400235 float tonality_max;
236 float tonality_avg;
237 int tonality_count;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500238 int i;
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400239 int pos0;
240 float prob_avg;
241 float prob_count;
242 float prob_min, prob_max;
243 float vad_prob;
244 int mpos, vpos;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500245
246 pos = tonal->read_pos;
247 curr_lookahead = tonal->write_pos-tonal->read_pos;
248 if (curr_lookahead<0)
249 curr_lookahead += DETECT_SIZE;
250
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500251 /* On long frames, look at the second analysis window rather than the first. */
252 if (len > tonal->Fs/50 && pos != tonal->write_pos)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500253 {
254 pos++;
255 if (pos==DETECT_SIZE)
256 pos=0;
257 }
258 if (pos == tonal->write_pos)
259 pos--;
260 if (pos<0)
261 pos = DETECT_SIZE-1;
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400262 pos0 = pos;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500263 OPUS_COPY(info_out, &tonal->info[pos], 1);
Jean-Marc Valin41443422017-06-09 22:16:53 -0400264 tonality_max = tonality_avg = info_out->tonality;
265 tonality_count = 1;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500266 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
267 for (i=0;i<3;i++)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500268 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500269 pos++;
270 if (pos==DETECT_SIZE)
271 pos = 0;
272 if (pos == tonal->write_pos)
273 break;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400274 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
275 tonality_avg += tonal->info[pos].tonality;
276 tonality_count++;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500277 }
Jean-Marc Valin5bb3cbc2017-06-19 17:55:56 -0400278 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400279
280 mpos = vpos = pos0;
281 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
282 ~1 frame delay in the VAD prob. */
283 if (curr_lookahead > 15)
284 {
285 mpos += 5;
286 if (mpos>=DETECT_SIZE)
287 mpos -= DETECT_SIZE;
288 vpos += 1;
289 if (vpos>=DETECT_SIZE)
290 vpos -= DETECT_SIZE;
291 }
292
293 /* The following calculations attempt to minimize a "badness function"
294 for the transition. When switching from speech to music, the badness
295 of switching at frame k is
296 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
297 where
298 v_i is the activity probability (VAD) at frame i,
299 p_i is the music probability at frame i
300 T is the probability threshold for switching
301 S is the penalty for switching during active audio rather than silence
302 the current frame has index i=0
303
304 Rather than apply badness to directly decide when to switch, what we compute
305 instead is the threshold for which the optimal switching point is now. When
306 considering whether to switch now (frame 0) or at frame k, we have:
307 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
308 which gives us:
309 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
310 We take the min threshold across all positive values of k (up to the maximum
311 amount of lookahead we have) to give us the threshold for which the current
312 frame is the optimal switch point.
313
314 The last step is that we need to consider whether we want to switch at all.
315 For that we use the average of the music probability over the entire window.
316 If the threshold is higher than that average we're not going to
317 switch, so we compute a min with the average as well. The result of all these
318 min operations is music_prob_min, which gives the threshold for switching to music
319 if we're currently encoding for speech.
320
321 We do the exact opposite to compute music_prob_max which is used for switching
322 from music to speech.
323 */
324 prob_min = 1.f;
325 prob_max = 0.f;
326 vad_prob = tonal->info[vpos].activity_probability;
327 prob_count = MAX16(.1f, vad_prob);
328 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
329 while (1)
330 {
331 float pos_vad;
332 mpos++;
333 if (mpos==DETECT_SIZE)
334 mpos = 0;
335 if (mpos == tonal->write_pos)
336 break;
337 vpos++;
338 if (vpos==DETECT_SIZE)
339 vpos = 0;
340 if (vpos == tonal->write_pos)
341 break;
342 pos_vad = tonal->info[vpos].activity_probability;
343 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
344 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
345 prob_count += MAX16(.1f, pos_vad);
346 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
347 }
348 info_out->music_prob = prob_avg/prob_count;
349 prob_min = MIN16(prob_avg/prob_count, prob_min);
350 prob_max = MAX16(prob_avg/prob_count, prob_max);
351 prob_min = MAX16(prob_min, 0.f);
352 prob_max = MIN16(prob_max, 1.f);
353
354 /* If we don't have enough look-ahead, do our best to make a decent decision. */
355 if (curr_lookahead < 10)
356 {
357 float pmin, pmax;
358 pmin = prob_min;
359 pmax = prob_max;
360 pos = pos0;
361 /* Look for min/max in the past. */
362 for (i=0;i<IMIN(tonal->count-1, 15);i++)
363 {
364 pos--;
365 if (pos < 0)
366 pos = DETECT_SIZE-1;
367 pmin = MIN16(pmin, tonal->info[pos].music_prob);
368 pmax = MAX16(pmax, tonal->info[pos].music_prob);
369 }
370 /* Bias against switching on active audio. */
371 pmin = MAX16(0.f, pmin - .1f*vad_prob);
372 pmax = MIN16(1.f, pmax + .1f*vad_prob);
373 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
374 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
375 }
376 info_out->music_prob_min = prob_min;
377 info_out->music_prob_max = prob_max;
378
379 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500380 tonal->read_subframe += len/(tonal->Fs/400);
381 while (tonal->read_subframe>=8)
382 {
383 tonal->read_subframe -= 8;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500384 tonal->read_pos++;
385 }
386 if (tonal->read_pos>=DETECT_SIZE)
387 tonal->read_pos-=DETECT_SIZE;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500388}
389
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500390static const float std_feature_bias[9] = {
Mark Harrisd6d70372017-02-20 19:51:40 -0800391 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
392 2.163313f, 1.260756f, 1.116868f, 1.918795f
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500393};
394
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400395#define LEAKAGE_OFFSET 2.5f
396#define LEAKAGE_SLOPE 2.f
397
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400398#ifdef FIXED_POINT
399/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
400 compensate for that in the energy. */
401#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
402#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
403#else
404#define SCALE_ENER(e) (e)
405#endif
406
Ralph Gilesd43445f2015-12-30 10:00:17 -0800407static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500408{
409 int i, b;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500410 const kiss_fft_state *kfft;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400411 VARDECL(kiss_fft_cpx, in);
412 VARDECL(kiss_fft_cpx, out);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500413 int N = 480, N2=240;
Joshua Bowman3b60e812012-10-23 12:18:38 -0700414 float * OPUS_RESTRICT A = tonal->angle;
415 float * OPUS_RESTRICT dA = tonal->d_angle;
416 float * OPUS_RESTRICT d2A = tonal->d2_angle;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400417 VARDECL(float, tonality);
418 VARDECL(float, noisiness);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500419 float band_tonality[NB_TBANDS];
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800420 float logE[NB_TBANDS];
421 float BFCC[8];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400422 float features[25];
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500423 float frame_tonality;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500424 float max_frame_tonality;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500425 /*float tw_sum=0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800426 float frame_noisiness;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500427 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500428 float slope=0;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800429 float frame_stationarity;
430 float relativeE;
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500431 float frame_probs[2];
Jean-Marc Valina2054572011-11-25 23:07:46 -0500432 float alpha, alphaE, alphaE2;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500433 float frame_loudness;
Jean-Marc Valina2054572011-11-25 23:07:46 -0500434 float bandwidth_mask;
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400435 int is_masked[NB_TBANDS+1];
Jean-Marc Valina2054572011-11-25 23:07:46 -0500436 int bandwidth=0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500437 float maxE = 0;
438 float noise_floor;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500439 int remaining;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500440 AnalysisInfo *info;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500441 float hp_ener;
442 float tonality2[240];
443 float midE[8];
444 float spec_variability=0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400445 float band_log2[NB_TBANDS+1];
446 float leakage_from[NB_TBANDS+1];
447 float leakage_to[NB_TBANDS+1];
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400448 float layer_out[MAX_NEURONS];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400449 SAVE_STACK;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500450
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500451 alpha = 1.f/IMIN(10, 1+tonal->count);
452 alphaE = 1.f/IMIN(25, 1+tonal->count);
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400453 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
454 alphaE2 = 1.f/IMIN(100, 1+tonal->count);
455 if (tonal->count <= 1) alphaE2 = 1;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500456
457 if (tonal->Fs == 48000)
458 {
459 /* len and offset are now at 24 kHz. */
460 len/= 2;
461 offset /= 2;
462 } else if (tonal->Fs == 16000) {
463 len = 3*len/2;
464 offset = 3*offset/2;
465 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500466
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500467 kfft = celt_mode->mdct.kfft[0];
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500468 if (tonal->count==0)
469 tonal->mem_fill = 240;
Mark Harrisd6d70372017-02-20 19:51:40 -0800470 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
471 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500472 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500473 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
474 {
475 tonal->mem_fill += len;
476 /* Don't have enough to update the analysis */
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400477 RESTORE_STACK;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500478 return;
479 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500480 hp_ener = tonal->hp_ener_accum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500481 info = &tonal->info[tonal->write_pos++];
482 if (tonal->write_pos>=DETECT_SIZE)
483 tonal->write_pos-=DETECT_SIZE;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500484
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400485 ALLOC(in, 480, kiss_fft_cpx);
486 ALLOC(out, 480, kiss_fft_cpx);
487 ALLOC(tonality, 240, float);
488 ALLOC(noisiness, 240, float);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500489 for (i=0;i<N2;i++)
490 {
491 float w = analysis_window[i];
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500492 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
493 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
494 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
495 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500496 }
497 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
498 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
Mark Harrisd6d70372017-02-20 19:51:40 -0800499 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
500 &tonal->inmem[240], tonal->downmix_state, remaining,
501 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500502 tonal->mem_fill = 240 + remaining;
Ralph Gilesd43445f2015-12-30 10:00:17 -0800503 opus_fft(kfft, in, out, tonal->arch);
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500504#ifndef FIXED_POINT
505 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
506 if (celt_isnan(out[0].r))
507 {
508 info->valid = 0;
509 RESTORE_STACK;
510 return;
511 }
512#endif
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500513
514 for (i=1;i<N2;i++)
515 {
516 float X1r, X2r, X1i, X2i;
517 float angle, d_angle, d2_angle;
518 float angle2, d_angle2, d2_angle2;
519 float mod1, mod2, avg_mod;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500520 X1r = (float)out[i].r+out[N-i].r;
521 X1i = (float)out[i].i-out[N-i].i;
522 X2r = (float)out[i].i+out[N-i].i;
523 X2i = (float)out[N-i].r-out[i].r;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800524
Jean-Marc Valind683c762012-12-21 16:17:38 -0500525 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500526 d_angle = angle - A[i];
527 d2_angle = d_angle - dA[i];
528
Jean-Marc Valind683c762012-12-21 16:17:38 -0500529 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500530 d_angle2 = angle2 - angle;
531 d2_angle2 = d_angle2 - d_angle;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500532
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500533 mod1 = d2_angle - (float)float2int(d2_angle);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500534 noisiness[i] = ABS16(mod1);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500535 mod1 *= mod1;
536 mod1 *= mod1;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800537
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500538 mod2 = d2_angle2 - (float)float2int(d2_angle2);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500539 noisiness[i] += ABS16(mod2);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500540 mod2 *= mod2;
541 mod2 *= mod2;
542
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500543 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
544 /* This introduces an extra delay of 2 frames in the detection. */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500545 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500546 /* No delay on this detection, but it's less reliable. */
547 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500548
549 A[i] = angle2;
550 dA[i] = d_angle2;
551 d2A[i] = mod2;
552 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500553 for (i=2;i<N2-1;i++)
554 {
555 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
Mark Harrisd6d70372017-02-20 19:51:40 -0800556 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500557 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500558 frame_tonality = 0;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500559 max_frame_tonality = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500560 /*tw_sum = 0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800561 info->activity = 0;
562 frame_noisiness = 0;
563 frame_stationarity = 0;
564 if (!tonal->count)
565 {
566 for (b=0;b<NB_TBANDS;b++)
567 {
568 tonal->lowE[b] = 1e10;
569 tonal->highE[b] = -1e10;
570 }
571 }
572 relativeE = 0;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500573 frame_loudness = 0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400574 /* The energy of the very first band is special because of DC. */
575 {
576 float E = 0;
577 float X1r, X2r;
578 X1r = 2*(float)out[0].r;
579 X2r = 2*(float)out[0].i;
580 E = X1r*X1r + X2r*X2r;
581 for (i=1;i<4;i++)
582 {
583 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
584 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
585 E += binE;
586 }
587 E = SCALE_ENER(E);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400588 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400589 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500590 for (b=0;b<NB_TBANDS;b++)
591 {
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800592 float E=0, tE=0, nE=0;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500593 float L1, L2;
594 float stationarity;
595 for (i=tbands[b];i<tbands[b+1];i++)
596 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400597 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
598 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400599 binE = SCALE_ENER(binE);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500600 E += binE;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500601 tE += binE*MAX32(0, tonality[i]);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500602 nE += binE*2.f*(.5f-noisiness[i]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500603 }
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500604#ifndef FIXED_POINT
605 /* Check for extreme band energies that could cause NaNs later. */
606 if (!(E<1e9f) || celt_isnan(E))
607 {
608 info->valid = 0;
609 RESTORE_STACK;
610 return;
611 }
612#endif
613
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500614 tonal->E[tonal->E_count][b] = E;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500615 frame_noisiness += nE/(1e-15f+E);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800616
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500617 frame_loudness += (float)sqrt(E+1e-10f);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500618 logE[b] = (float)log(E+1e-10f);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400619 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500620 tonal->logE[tonal->E_count][b] = logE[b];
621 if (tonal->count==0)
622 tonal->highE[b] = tonal->lowE[b] = logE[b];
623 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800624 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500625 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
Mark Harrisd6d70372017-02-20 19:51:40 -0800626 tonal->highE[b] -= .01f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500627 else
Mark Harrisd6d70372017-02-20 19:51:40 -0800628 tonal->lowE[b] += .01f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800629 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500630 if (logE[b] > tonal->highE[b])
631 {
632 tonal->highE[b] = logE[b];
633 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
634 } else if (logE[b] < tonal->lowE[b])
635 {
636 tonal->lowE[b] = logE[b];
637 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
638 }
639 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800640
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500641 L1=L2=0;
642 for (i=0;i<NB_FRAMES;i++)
643 {
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500644 L1 += (float)sqrt(tonal->E[i][b]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500645 L2 += tonal->E[i][b];
646 }
647
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500648 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500649 stationarity *= stationarity;
650 stationarity *= stationarity;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800651 frame_stationarity += stationarity;
652 /*band_tonality[b] = tE/(1e-15+E)*/;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500653 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500654#if 0
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800655 if (b>=NB_TONAL_SKIP_BANDS)
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500656 {
657 frame_tonality += tweight[b]*band_tonality[b];
658 tw_sum += tweight[b];
659 }
660#else
661 frame_tonality += band_tonality[b];
662 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
663 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
664#endif
Jean-Marc Valind683c762012-12-21 16:17:38 -0500665 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500666 slope += band_tonality[b]*(b-8);
Jean-Marc Valin70d90d12011-11-28 14:17:47 -0500667 /*printf("%f %f ", band_tonality[b], stationarity);*/
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500668 tonal->prev_band_tonality[b] = band_tonality[b];
669 }
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500670
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400671 leakage_from[0] = band_log2[0];
672 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
673 for (b=1;b<NB_TBANDS+1;b++)
674 {
675 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
676 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
677 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
678 }
679 for (b=NB_TBANDS-2;b>=0;b--)
680 {
681 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
682 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
683 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
684 }
685 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
686 for (b=0;b<NB_TBANDS+1;b++)
687 {
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400688 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400689 represents the boost needed to overcome the amount of analysis leakage
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400690 cause in a weaker band b by louder neighbouring bands.
691 The second, based on leakage_from[], applies to a loud band b for
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400692 which the quantization noise causes synthesis leakage to the weaker
693 neighbouring bands. */
694 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
695 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
696 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
697 }
698 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
699
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500700 for (i=0;i<NB_FRAMES;i++)
701 {
702 int j;
Mark Harrisd6d70372017-02-20 19:51:40 -0800703 float mindist = 1e15f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500704 for (j=0;j<NB_FRAMES;j++)
705 {
706 int k;
707 float dist=0;
708 for (k=0;k<NB_TBANDS;k++)
709 {
710 float tmp;
711 tmp = tonal->logE[i][k] - tonal->logE[j][k];
712 dist += tmp*tmp;
713 }
714 if (j!=i)
715 mindist = MIN32(mindist, dist);
716 }
717 spec_variability += mindist;
718 }
Mark Harrisd6d70372017-02-20 19:51:40 -0800719 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500720 bandwidth_mask = 0;
721 bandwidth = 0;
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400722 maxE = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500723 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
724 noise_floor *= noise_floor;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500725 for (b=0;b<NB_TBANDS;b++)
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500726 {
727 float E=0;
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400728 float Em;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500729 int band_start, band_end;
730 /* Keep a margin of 300 Hz for aliasing */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500731 band_start = tbands[b];
732 band_end = tbands[b+1];
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500733 for (i=band_start;i<band_end;i++)
734 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400735 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
736 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500737 E += binE;
738 }
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400739 E = SCALE_ENER(E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500740 maxE = MAX32(maxE, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400741 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400742 Em = MAX32(E, tonal->meanE[b]);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400743 /* Consider the band "active" only if all these conditions are met:
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400744 1) less than 90 dB below the peak band (maximal masking possible considering
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400745 both the ATH and the loudness-dependent slope of the spreading function)
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400746 2) above the PCM quantization noise floor
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500747 We use b+1 because the first CELT band isn't included in tbands[]
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400748 */
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400749 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500750 bandwidth = b+1;
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400751 /* Check if the band is masked (see below). */
752 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
753 /* Use a simple follower with 13 dB/Bark slope for spreading function. */
754 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500755 }
756 /* Special case for the last two bands, for which we don't have spectrum but only
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400757 the energy above 12 kHz. The difficulty here is that the high-pass we use
758 leaks some LF energy, so we need to increase the threshold without accidentally cutting
759 off the band. */
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400760 if (tonal->Fs == 48000) {
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400761 float noise_ratio;
762 float Em;
763 float E = hp_ener*(1.f/(60*60));
764 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
765
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500766#ifdef FIXED_POINT
767 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400768 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500769#endif
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500770 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400771 Em = MAX32(E, tonal->meanE[b]);
772 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400773 bandwidth = 20;
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400774 /* Check if the band is masked (see below). */
775 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500776 }
Jean-Marc Valinb30f45b2017-10-06 16:45:41 -0400777 /* In some cases, resampling aliasing can create a small amount of energy in the first band
778 being cut. So if the last band is masked, we don't include it. */
779 if (bandwidth == 20 && is_masked[NB_TBANDS])
780 bandwidth-=2;
781 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
782 bandwidth--;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500783 if (tonal->count<=2)
784 bandwidth = 20;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500785 frame_loudness = 20*(float)log10(frame_loudness);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500786 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500787 tonal->lowECount *= (1-alphaE);
788 if (frame_loudness < tonal->Etracker-30)
789 tonal->lowECount += alphaE;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800790
791 for (i=0;i<8;i++)
792 {
793 float sum=0;
794 for (b=0;b<16;b++)
795 sum += dct_table[i*16+b]*logE[b];
796 BFCC[i] = sum;
797 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500798 for (i=0;i<8;i++)
799 {
800 float sum=0;
801 for (b=0;b<16;b++)
Mark Harrisd6d70372017-02-20 19:51:40 -0800802 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500803 midE[i] = sum;
804 }
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800805
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800806 frame_stationarity /= NB_TBANDS;
807 relativeE /= NB_TBANDS;
808 if (tonal->count<10)
Jean-Marc Valin5bb3cbc2017-06-19 17:55:56 -0400809 relativeE = .5f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800810 frame_noisiness /= NB_TBANDS;
811#if 1
812 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
813#else
814 info->activity = .5*(1+frame_noisiness-frame_stationarity);
815#endif
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500816 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
Jean-Marc Valind683c762012-12-21 16:17:38 -0500817 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500818 tonal->prev_tonality = frame_tonality;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500819
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500820 slope /= 8*8;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800821 info->tonality_slope = slope;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500822
823 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500824 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800825 info->tonality = frame_tonality;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800826
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500827 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500828 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500829
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500830 for (i=0;i<4;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500831 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
832
833 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500834 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500835 for (i=0;i<3;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500836 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500837
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500838 if (tonal->count > 5)
839 {
840 for (i=0;i<9;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500841 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500842 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500843 for (i=0;i<4;i++)
844 features[i] = BFCC[i]-midE[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500845
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800846 for (i=0;i<8;i++)
847 {
848 tonal->mem[i+24] = tonal->mem[i+16];
849 tonal->mem[i+16] = tonal->mem[i+8];
850 tonal->mem[i+8] = tonal->mem[i];
851 tonal->mem[i] = BFCC[i];
852 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500853 for (i=0;i<9;i++)
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500854 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
Mark Harrisd6d70372017-02-20 19:51:40 -0800855 features[18] = spec_variability - 0.78f;
856 features[20] = info->tonality - 0.154723f;
857 features[21] = info->activity - 0.724643f;
858 features[22] = frame_stationarity - 0.743717f;
859 features[23] = info->tonality_slope + 0.069216f;
860 features[24] = tonal->lowECount - 0.067930f;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500861
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400862 compute_dense(&layer0, layer_out, features);
863 compute_gru(&layer1, tonal->rnn_state, layer_out);
864 compute_dense(&layer2, frame_probs, tonal->rnn_state);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500865
Felicia Lim36481342016-05-16 15:29:53 +0200866 /* Probability of speech or music vs noise */
867 info->activity_probability = frame_probs[1];
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400868 /* It seems like the RNN tends to have a bias towards speech and this
869 warping of the probabilities compensates for it. */
Jean-Marc Valinff982022017-11-08 20:35:30 -0500870 info->music_prob = MAX16(1-10*(1-frame_probs[0]), MIN16(10*frame_probs[0], .12+.69*frame_probs[0]*(2-frame_probs[0])));
Felicia Lim36481342016-05-16 15:29:53 +0200871
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400872 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500873#ifdef MLP_TRAINING
874 for (i=0;i<25;i++)
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800875 printf("%f ", features[i]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500876 printf("\n");
877#endif
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800878
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500879 info->bandwidth = bandwidth;
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400880 tonal->prev_bandwidth = bandwidth;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500881 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
Jean-Marc Valin2a9fdbc2011-12-07 17:39:40 -0500882 info->noisiness = frame_noisiness;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800883 info->valid = 1;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400884 RESTORE_STACK;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500885}
886
Jean-Marc Valina4c25122013-09-28 17:22:41 -0400887void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
Jean-Marc Valinb90e63b2013-09-16 13:08:52 -0400888 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
Ralph Gilesd43445f2015-12-30 10:00:17 -0800889 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500890{
891 int offset;
892 int pcm_len;
893
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500894 analysis_frame_size -= analysis_frame_size&1;
Jean-Marc Valin29254442013-09-28 19:29:23 -0400895 if (analysis_pcm != NULL)
896 {
897 /* Avoid overflow/wrap-around of the analysis buffer */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500898 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500899
Jean-Marc Valin29254442013-09-28 19:29:23 -0400900 pcm_len = analysis_frame_size - analysis->analysis_offset;
901 offset = analysis->analysis_offset;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400902 while (pcm_len>0) {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500903 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
904 offset += Fs/50;
905 pcm_len -= Fs/50;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400906 }
Jean-Marc Valin29254442013-09-28 19:29:23 -0400907 analysis->analysis_offset = analysis_frame_size;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500908
Jean-Marc Valin29254442013-09-28 19:29:23 -0400909 analysis->analysis_offset -= frame_size;
910 }
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500911
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500912 analysis_info->valid = 0;
913 tonality_get_info(analysis, analysis_info, frame_size);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500914}
Mark Harrisd6d70372017-02-20 19:51:40 -0800915
916#endif /* DISABLE_FLOAT_API */