blob: 1d6dd8291a251b9d2d8581deca7d431ae3421ce7 [file] [log] [blame]
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -05001/* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3/*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Jean-Marc Valin2ff65562016-08-11 11:05:51 -040032#define ANALYSIS_C
33
34#include <stdio.h>
35
36#include "mathops.h"
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -050037#include "kiss_fft.h"
38#include "celt.h"
39#include "modes.h"
40#include "arch.h"
41#include "quant_bands.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040042#include "analysis.h"
43#include "mlp.h"
Jean-Marc Valina8783a12013-04-15 15:49:40 -040044#include "stack_alloc.h"
Jean-Marc Valincf9409f2016-12-16 17:52:15 -050045#include "float_cast.h"
Jean-Marc Valinc5880fe2012-07-05 01:28:45 -040046
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080047#ifndef M_PI
48#define M_PI 3.141592653
49#endif
50
Mark Harrisd6d70372017-02-20 19:51:40 -080051#ifndef DISABLE_FLOAT_API
52
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -040053#define TRANSITION_PENALTY 10
54
Jean-Marc Valin027ff072012-09-14 15:49:18 -040055static const float dct_table[128] = {
Jean-Marc Valind683c762012-12-21 16:17:38 -050056 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
57 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
58 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
59 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
60 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
61 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
62 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
63 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
64 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
65 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
66 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
67 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
68 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
69 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
70 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
71 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
Jean-Marc Valin73eb3632011-11-16 10:47:15 +080072};
73
Jean-Marc Valin027ff072012-09-14 15:49:18 -040074static const float analysis_window[240] = {
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -040075 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
76 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
77 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
78 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
79 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
80 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
81 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
82 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
83 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
84 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
85 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
86 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
87 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
88 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
89 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
90 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
91 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
92 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
93 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
94 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
95 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
96 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
97 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
98 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
99 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
100 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
101 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
102 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
103 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
104 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
105};
106
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500107static const int tbands[NB_TBANDS+1] = {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500108 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500109};
110
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500111#define NB_TONAL_SKIP_BANDS 9
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800112
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500113static opus_val32 silk_resampler_down2_hp(
114 opus_val32 *S, /* I/O State vector [ 2 ] */
115 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
116 const opus_val32 *in, /* I Input signal [ len ] */
117 int inLen /* I Number of input samples */
118)
119{
120 int k, len2 = inLen/2;
121 opus_val32 in32, out32, out32_hp, Y, X;
122 opus_val64 hp_ener = 0;
123 /* Internal variables and state are in Q10 format */
124 for( k = 0; k < len2; k++ ) {
125 /* Convert to Q10 */
126 in32 = in[ 2 * k ];
Jean-Marc Valinb15b30c2012-07-04 12:08:22 -0400127
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500128 /* All-pass section for even input sample */
129 Y = SUB32( in32, S[ 0 ] );
130 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
131 out32 = ADD32( S[ 0 ], X );
132 S[ 0 ] = ADD32( in32, X );
133 out32_hp = out32;
134 /* Convert to Q10 */
135 in32 = in[ 2 * k + 1 ];
136
137 /* All-pass section for odd input sample, and add to output of previous section */
138 Y = SUB32( in32, S[ 1 ] );
139 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
140 out32 = ADD32( out32, S[ 1 ] );
141 out32 = ADD32( out32, X );
142 S[ 1 ] = ADD32( in32, X );
143
144 Y = SUB32( -in32, S[ 2 ] );
145 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
146 out32_hp = ADD32( out32_hp, S[ 2 ] );
147 out32_hp = ADD32( out32_hp, X );
148 S[ 2 ] = ADD32( -in32, X );
149
150 hp_ener += out32_hp*(opus_val64)out32_hp;
151 /* Add, convert back to int16 and store to output */
152 out[ k ] = HALF32(out32);
153 }
154#ifdef FIXED_POINT
155 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
156 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
157#endif
Mark Harrisd6d70372017-02-20 19:51:40 -0800158 return (opus_val32)hp_ener;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500159}
160
161static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
162{
163 VARDECL(opus_val32, tmp);
164 opus_val32 scale;
165 int j;
166 opus_val32 ret = 0;
167 SAVE_STACK;
168
169 if (subframe==0) return 0;
170 if (Fs == 48000)
171 {
172 subframe *= 2;
173 offset *= 2;
174 } else if (Fs == 16000) {
175 subframe = subframe*2/3;
176 offset = offset*2/3;
177 }
178 ALLOC(tmp, subframe, opus_val32);
179
180 downmix(_x, tmp, subframe, offset, c1, c2, C);
181#ifdef FIXED_POINT
182 scale = (1<<SIG_SHIFT);
183#else
184 scale = 1.f/32768;
185#endif
186 if (c2==-2)
187 scale /= C;
188 else if (c2>-1)
189 scale /= 2;
190 for (j=0;j<subframe;j++)
191 tmp[j] *= scale;
192 if (Fs == 48000)
193 {
194 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
195 } else if (Fs == 24000) {
196 OPUS_COPY(y, tmp, subframe);
197 } else if (Fs == 16000) {
198 VARDECL(opus_val32, tmp3x);
199 ALLOC(tmp3x, 3*subframe, opus_val32);
200 /* Don't do this at home! This resampler is horrible and it's only (barely)
201 usable for the purpose of the analysis because we don't care about all
202 the aliasing between 8 kHz and 12 kHz. */
203 for (j=0;j<subframe;j++)
204 {
205 tmp3x[3*j] = tmp[j];
206 tmp3x[3*j+1] = tmp[j];
207 tmp3x[3*j+2] = tmp[j];
208 }
209 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
210 }
211 RESTORE_STACK;
212 return ret;
213}
214
215void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
Ralph Gilesd43445f2015-12-30 10:00:17 -0800216{
217 /* Initialize reusable fields. */
218 tonal->arch = opus_select_arch();
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500219 tonal->Fs = Fs;
Ralph Giles9e3872a2015-12-30 15:27:02 -0800220 /* Clear remaining fields. */
221 tonality_analysis_reset(tonal);
222}
223
224void tonality_analysis_reset(TonalityAnalysisState *tonal)
225{
226 /* Clear non-reusable fields. */
227 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
228 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
Ralph Gilesd43445f2015-12-30 10:00:17 -0800229}
230
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500231void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232{
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500233 int pos;
234 int curr_lookahead;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400235 float tonality_max;
236 float tonality_avg;
237 int tonality_count;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500238 int i;
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400239 int pos0;
240 float prob_avg;
241 float prob_count;
242 float prob_min, prob_max;
243 float vad_prob;
244 int mpos, vpos;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500245
246 pos = tonal->read_pos;
247 curr_lookahead = tonal->write_pos-tonal->read_pos;
248 if (curr_lookahead<0)
249 curr_lookahead += DETECT_SIZE;
250
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500251 /* On long frames, look at the second analysis window rather than the first. */
252 if (len > tonal->Fs/50 && pos != tonal->write_pos)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500253 {
254 pos++;
255 if (pos==DETECT_SIZE)
256 pos=0;
257 }
258 if (pos == tonal->write_pos)
259 pos--;
260 if (pos<0)
261 pos = DETECT_SIZE-1;
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400262 pos0 = pos;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500263 OPUS_COPY(info_out, &tonal->info[pos], 1);
Jean-Marc Valin41443422017-06-09 22:16:53 -0400264 tonality_max = tonality_avg = info_out->tonality;
265 tonality_count = 1;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500266 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
267 for (i=0;i<3;i++)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500268 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500269 pos++;
270 if (pos==DETECT_SIZE)
271 pos = 0;
272 if (pos == tonal->write_pos)
273 break;
Jean-Marc Valin41443422017-06-09 22:16:53 -0400274 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
275 tonality_avg += tonal->info[pos].tonality;
276 tonality_count++;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500277 }
Jean-Marc Valin5bb3cbc2017-06-19 17:55:56 -0400278 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400279
280 mpos = vpos = pos0;
281 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
282 ~1 frame delay in the VAD prob. */
283 if (curr_lookahead > 15)
284 {
285 mpos += 5;
286 if (mpos>=DETECT_SIZE)
287 mpos -= DETECT_SIZE;
288 vpos += 1;
289 if (vpos>=DETECT_SIZE)
290 vpos -= DETECT_SIZE;
291 }
292
293 /* The following calculations attempt to minimize a "badness function"
294 for the transition. When switching from speech to music, the badness
295 of switching at frame k is
296 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
297 where
298 v_i is the activity probability (VAD) at frame i,
299 p_i is the music probability at frame i
300 T is the probability threshold for switching
301 S is the penalty for switching during active audio rather than silence
302 the current frame has index i=0
303
304 Rather than apply badness to directly decide when to switch, what we compute
305 instead is the threshold for which the optimal switching point is now. When
306 considering whether to switch now (frame 0) or at frame k, we have:
307 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
308 which gives us:
309 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
310 We take the min threshold across all positive values of k (up to the maximum
311 amount of lookahead we have) to give us the threshold for which the current
312 frame is the optimal switch point.
313
314 The last step is that we need to consider whether we want to switch at all.
315 For that we use the average of the music probability over the entire window.
316 If the threshold is higher than that average we're not going to
317 switch, so we compute a min with the average as well. The result of all these
318 min operations is music_prob_min, which gives the threshold for switching to music
319 if we're currently encoding for speech.
320
321 We do the exact opposite to compute music_prob_max which is used for switching
322 from music to speech.
323 */
324 prob_min = 1.f;
325 prob_max = 0.f;
326 vad_prob = tonal->info[vpos].activity_probability;
327 prob_count = MAX16(.1f, vad_prob);
328 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
329 while (1)
330 {
331 float pos_vad;
332 mpos++;
333 if (mpos==DETECT_SIZE)
334 mpos = 0;
335 if (mpos == tonal->write_pos)
336 break;
337 vpos++;
338 if (vpos==DETECT_SIZE)
339 vpos = 0;
340 if (vpos == tonal->write_pos)
341 break;
342 pos_vad = tonal->info[vpos].activity_probability;
343 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
344 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
345 prob_count += MAX16(.1f, pos_vad);
346 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
347 }
348 info_out->music_prob = prob_avg/prob_count;
349 prob_min = MIN16(prob_avg/prob_count, prob_min);
350 prob_max = MAX16(prob_avg/prob_count, prob_max);
351 prob_min = MAX16(prob_min, 0.f);
352 prob_max = MIN16(prob_max, 1.f);
353
354 /* If we don't have enough look-ahead, do our best to make a decent decision. */
355 if (curr_lookahead < 10)
356 {
357 float pmin, pmax;
358 pmin = prob_min;
359 pmax = prob_max;
360 pos = pos0;
361 /* Look for min/max in the past. */
362 for (i=0;i<IMIN(tonal->count-1, 15);i++)
363 {
364 pos--;
365 if (pos < 0)
366 pos = DETECT_SIZE-1;
367 pmin = MIN16(pmin, tonal->info[pos].music_prob);
368 pmax = MAX16(pmax, tonal->info[pos].music_prob);
369 }
370 /* Bias against switching on active audio. */
371 pmin = MAX16(0.f, pmin - .1f*vad_prob);
372 pmax = MIN16(1.f, pmax + .1f*vad_prob);
373 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
374 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
375 }
376 info_out->music_prob_min = prob_min;
377 info_out->music_prob_max = prob_max;
378
379 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500380 tonal->read_subframe += len/(tonal->Fs/400);
381 while (tonal->read_subframe>=8)
382 {
383 tonal->read_subframe -= 8;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500384 tonal->read_pos++;
385 }
386 if (tonal->read_pos>=DETECT_SIZE)
387 tonal->read_pos-=DETECT_SIZE;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500388}
389
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500390static const float std_feature_bias[9] = {
Mark Harrisd6d70372017-02-20 19:51:40 -0800391 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
392 2.163313f, 1.260756f, 1.116868f, 1.918795f
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500393};
394
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400395#define LEAKAGE_OFFSET 2.5f
396#define LEAKAGE_SLOPE 2.f
397
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400398#ifdef FIXED_POINT
399/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
400 compensate for that in the energy. */
401#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
402#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
403#else
404#define SCALE_ENER(e) (e)
405#endif
406
Ralph Gilesd43445f2015-12-30 10:00:17 -0800407static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500408{
409 int i, b;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500410 const kiss_fft_state *kfft;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400411 VARDECL(kiss_fft_cpx, in);
412 VARDECL(kiss_fft_cpx, out);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500413 int N = 480, N2=240;
Joshua Bowman3b60e812012-10-23 12:18:38 -0700414 float * OPUS_RESTRICT A = tonal->angle;
415 float * OPUS_RESTRICT dA = tonal->d_angle;
416 float * OPUS_RESTRICT d2A = tonal->d2_angle;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400417 VARDECL(float, tonality);
418 VARDECL(float, noisiness);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500419 float band_tonality[NB_TBANDS];
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800420 float logE[NB_TBANDS];
421 float BFCC[8];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400422 float features[25];
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500423 float frame_tonality;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500424 float max_frame_tonality;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500425 /*float tw_sum=0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800426 float frame_noisiness;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500427 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500428 float slope=0;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800429 float frame_stationarity;
430 float relativeE;
Jean-Marc Valin742aac12013-02-22 16:44:56 -0500431 float frame_probs[2];
Jean-Marc Valina2054572011-11-25 23:07:46 -0500432 float alpha, alphaE, alphaE2;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500433 float frame_loudness;
Jean-Marc Valina2054572011-11-25 23:07:46 -0500434 float bandwidth_mask;
435 int bandwidth=0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500436 float maxE = 0;
437 float noise_floor;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500438 int remaining;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500439 AnalysisInfo *info;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500440 float hp_ener;
441 float tonality2[240];
442 float midE[8];
443 float spec_variability=0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400444 float band_log2[NB_TBANDS+1];
445 float leakage_from[NB_TBANDS+1];
446 float leakage_to[NB_TBANDS+1];
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400447 float layer_out[MAX_NEURONS];
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400448 SAVE_STACK;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500449
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500450 alpha = 1.f/IMIN(10, 1+tonal->count);
451 alphaE = 1.f/IMIN(25, 1+tonal->count);
452 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
453
454 if (tonal->Fs == 48000)
455 {
456 /* len and offset are now at 24 kHz. */
457 len/= 2;
458 offset /= 2;
459 } else if (tonal->Fs == 16000) {
460 len = 3*len/2;
461 offset = 3*offset/2;
462 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500463
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500464 kfft = celt_mode->mdct.kfft[0];
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500465 if (tonal->count==0)
466 tonal->mem_fill = 240;
Mark Harrisd6d70372017-02-20 19:51:40 -0800467 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
468 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500469 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500470 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
471 {
472 tonal->mem_fill += len;
473 /* Don't have enough to update the analysis */
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400474 RESTORE_STACK;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500475 return;
476 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500477 hp_ener = tonal->hp_ener_accum;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500478 info = &tonal->info[tonal->write_pos++];
479 if (tonal->write_pos>=DETECT_SIZE)
480 tonal->write_pos-=DETECT_SIZE;
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500481
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400482 ALLOC(in, 480, kiss_fft_cpx);
483 ALLOC(out, 480, kiss_fft_cpx);
484 ALLOC(tonality, 240, float);
485 ALLOC(noisiness, 240, float);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500486 for (i=0;i<N2;i++)
487 {
488 float w = analysis_window[i];
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500489 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
490 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
491 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
492 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500493 }
494 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
495 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
Mark Harrisd6d70372017-02-20 19:51:40 -0800496 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
497 &tonal->inmem[240], tonal->downmix_state, remaining,
498 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
Jean-Marc Valin48ac1222012-11-14 02:39:27 -0500499 tonal->mem_fill = 240 + remaining;
Ralph Gilesd43445f2015-12-30 10:00:17 -0800500 opus_fft(kfft, in, out, tonal->arch);
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500501#ifndef FIXED_POINT
502 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
503 if (celt_isnan(out[0].r))
504 {
505 info->valid = 0;
506 RESTORE_STACK;
507 return;
508 }
509#endif
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500510
511 for (i=1;i<N2;i++)
512 {
513 float X1r, X2r, X1i, X2i;
514 float angle, d_angle, d2_angle;
515 float angle2, d_angle2, d2_angle2;
516 float mod1, mod2, avg_mod;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500517 X1r = (float)out[i].r+out[N-i].r;
518 X1i = (float)out[i].i-out[N-i].i;
519 X2r = (float)out[i].i+out[N-i].i;
520 X2i = (float)out[N-i].r-out[i].r;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800521
Jean-Marc Valind683c762012-12-21 16:17:38 -0500522 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500523 d_angle = angle - A[i];
524 d2_angle = d_angle - dA[i];
525
Jean-Marc Valind683c762012-12-21 16:17:38 -0500526 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500527 d_angle2 = angle2 - angle;
528 d2_angle2 = d_angle2 - d_angle;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500529
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500530 mod1 = d2_angle - (float)float2int(d2_angle);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500531 noisiness[i] = ABS16(mod1);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500532 mod1 *= mod1;
533 mod1 *= mod1;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800534
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500535 mod2 = d2_angle2 - (float)float2int(d2_angle2);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500536 noisiness[i] += ABS16(mod2);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500537 mod2 *= mod2;
538 mod2 *= mod2;
539
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500540 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
541 /* This introduces an extra delay of 2 frames in the detection. */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500542 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500543 /* No delay on this detection, but it's less reliable. */
544 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500545
546 A[i] = angle2;
547 dA[i] = d_angle2;
548 d2A[i] = mod2;
549 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500550 for (i=2;i<N2-1;i++)
551 {
552 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
Mark Harrisd6d70372017-02-20 19:51:40 -0800553 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500554 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500555 frame_tonality = 0;
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500556 max_frame_tonality = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500557 /*tw_sum = 0;*/
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800558 info->activity = 0;
559 frame_noisiness = 0;
560 frame_stationarity = 0;
561 if (!tonal->count)
562 {
563 for (b=0;b<NB_TBANDS;b++)
564 {
565 tonal->lowE[b] = 1e10;
566 tonal->highE[b] = -1e10;
567 }
568 }
569 relativeE = 0;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500570 frame_loudness = 0;
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400571 /* The energy of the very first band is special because of DC. */
572 {
573 float E = 0;
574 float X1r, X2r;
575 X1r = 2*(float)out[0].r;
576 X2r = 2*(float)out[0].i;
577 E = X1r*X1r + X2r*X2r;
578 for (i=1;i<4;i++)
579 {
580 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
581 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
582 E += binE;
583 }
584 E = SCALE_ENER(E);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400585 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400586 }
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500587 for (b=0;b<NB_TBANDS;b++)
588 {
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800589 float E=0, tE=0, nE=0;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500590 float L1, L2;
591 float stationarity;
592 for (i=tbands[b];i<tbands[b+1];i++)
593 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400594 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
595 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400596 binE = SCALE_ENER(binE);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500597 E += binE;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500598 tE += binE*MAX32(0, tonality[i]);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500599 nE += binE*2.f*(.5f-noisiness[i]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500600 }
Jean-Marc Valin122971b2013-12-10 13:55:35 -0500601#ifndef FIXED_POINT
602 /* Check for extreme band energies that could cause NaNs later. */
603 if (!(E<1e9f) || celt_isnan(E))
604 {
605 info->valid = 0;
606 RESTORE_STACK;
607 return;
608 }
609#endif
610
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500611 tonal->E[tonal->E_count][b] = E;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500612 frame_noisiness += nE/(1e-15f+E);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800613
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500614 frame_loudness += (float)sqrt(E+1e-10f);
Jean-Marc Valind683c762012-12-21 16:17:38 -0500615 logE[b] = (float)log(E+1e-10f);
Jean-Marc Valin7be054b2017-06-01 13:21:59 -0400616 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500617 tonal->logE[tonal->E_count][b] = logE[b];
618 if (tonal->count==0)
619 tonal->highE[b] = tonal->lowE[b] = logE[b];
620 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800621 {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500622 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
Mark Harrisd6d70372017-02-20 19:51:40 -0800623 tonal->highE[b] -= .01f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500624 else
Mark Harrisd6d70372017-02-20 19:51:40 -0800625 tonal->lowE[b] += .01f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800626 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500627 if (logE[b] > tonal->highE[b])
628 {
629 tonal->highE[b] = logE[b];
630 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
631 } else if (logE[b] < tonal->lowE[b])
632 {
633 tonal->lowE[b] = logE[b];
634 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
635 }
636 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800637
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500638 L1=L2=0;
639 for (i=0;i<NB_FRAMES;i++)
640 {
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500641 L1 += (float)sqrt(tonal->E[i][b]);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500642 L2 += tonal->E[i][b];
643 }
644
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500645 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500646 stationarity *= stationarity;
647 stationarity *= stationarity;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800648 frame_stationarity += stationarity;
649 /*band_tonality[b] = tE/(1e-15+E)*/;
Jean-Marc Valina71c9ad2013-11-13 12:07:01 -0500650 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500651#if 0
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800652 if (b>=NB_TONAL_SKIP_BANDS)
Jean-Marc Valin971b0552011-12-02 16:08:02 -0500653 {
654 frame_tonality += tweight[b]*band_tonality[b];
655 tw_sum += tweight[b];
656 }
657#else
658 frame_tonality += band_tonality[b];
659 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
660 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
661#endif
Jean-Marc Valind683c762012-12-21 16:17:38 -0500662 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500663 slope += band_tonality[b]*(b-8);
Jean-Marc Valin70d90d12011-11-28 14:17:47 -0500664 /*printf("%f %f ", band_tonality[b], stationarity);*/
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500665 tonal->prev_band_tonality[b] = band_tonality[b];
666 }
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500667
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400668 leakage_from[0] = band_log2[0];
669 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
670 for (b=1;b<NB_TBANDS+1;b++)
671 {
672 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
673 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
674 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
675 }
676 for (b=NB_TBANDS-2;b>=0;b--)
677 {
678 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
679 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
680 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
681 }
682 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
683 for (b=0;b<NB_TBANDS+1;b++)
684 {
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400685 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400686 represents the boost needed to overcome the amount of analysis leakage
Jean-Marc Valincf86d252017-06-01 17:56:31 -0400687 cause in a weaker band b by louder neighbouring bands.
688 The second, based on leakage_from[], applies to a loud band b for
Jean-Marc Valin0cc4d962017-06-01 03:14:13 -0400689 which the quantization noise causes synthesis leakage to the weaker
690 neighbouring bands. */
691 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
692 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
693 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
694 }
695 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
696
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500697 for (i=0;i<NB_FRAMES;i++)
698 {
699 int j;
Mark Harrisd6d70372017-02-20 19:51:40 -0800700 float mindist = 1e15f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500701 for (j=0;j<NB_FRAMES;j++)
702 {
703 int k;
704 float dist=0;
705 for (k=0;k<NB_TBANDS;k++)
706 {
707 float tmp;
708 tmp = tonal->logE[i][k] - tonal->logE[j][k];
709 dist += tmp*tmp;
710 }
711 if (j!=i)
712 mindist = MIN32(mindist, dist);
713 }
714 spec_variability += mindist;
715 }
Mark Harrisd6d70372017-02-20 19:51:40 -0800716 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500717 bandwidth_mask = 0;
718 bandwidth = 0;
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400719 maxE = 0;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500720 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
721 noise_floor *= noise_floor;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500722 for (b=0;b<NB_TBANDS;b++)
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500723 {
724 float E=0;
725 int band_start, band_end;
726 /* Keep a margin of 300 Hz for aliasing */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500727 band_start = tbands[b];
728 band_end = tbands[b+1];
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500729 for (i=band_start;i<band_end;i++)
730 {
Jean-Marc Valin3ab03e02013-09-06 16:00:39 -0400731 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
732 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500733 E += binE;
734 }
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400735 E = SCALE_ENER(E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500736 maxE = MAX32(maxE, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400737 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500738 E = MAX32(E, tonal->meanE[b]);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400739 /* Use a simple follower with 13 dB/Bark slope for spreading function */
Jean-Marc Valind683c762012-12-21 16:17:38 -0500740 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400741 /* Consider the band "active" only if all these conditions are met:
742 1) less than 10 dB below the simple follower
743 2) less than 90 dB below the peak band (maximal masking possible considering
744 both the ATH and the loudness-dependent slope of the spreading function)
745 3) above the PCM quantization noise floor
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500746 We use b+1 because the first CELT band isn't included in tbands[]
Jean-Marc Valin6862b442013-05-13 22:35:09 -0400747 */
Gregory Maxwell5280c712013-07-15 15:51:24 -0700748 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500749 bandwidth = b+1;
750 }
751 /* Special case for the last two bands, for which we don't have spectrum but only
752 the energy above 12 kHz. */
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400753 if (tonal->Fs == 48000) {
754 float ratio;
Mark Harrisd6d70372017-02-20 19:51:40 -0800755 float E = hp_ener*(1.f/(240*240));
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400756 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500757#ifdef FIXED_POINT
758 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
Jean-Marc Valina1c2d712017-06-01 03:13:01 -0400759 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500760#endif
761 maxE = MAX32(maxE, E);
762 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
763 E = MAX32(E, tonal->meanE[b]);
764 /* Use a simple follower with 13 dB/Bark slope for spreading function */
765 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400766 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
767 bandwidth = 20;
768 /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
769 if (bandwidth >= 17)
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500770 bandwidth = 20;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500771 }
772 if (tonal->count<=2)
773 bandwidth = 20;
Jean-Marc Valind683c762012-12-21 16:17:38 -0500774 frame_loudness = 20*(float)log10(frame_loudness);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500775 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500776 tonal->lowECount *= (1-alphaE);
777 if (frame_loudness < tonal->Etracker-30)
778 tonal->lowECount += alphaE;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800779
780 for (i=0;i<8;i++)
781 {
782 float sum=0;
783 for (b=0;b<16;b++)
784 sum += dct_table[i*16+b]*logE[b];
785 BFCC[i] = sum;
786 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500787 for (i=0;i<8;i++)
788 {
789 float sum=0;
790 for (b=0;b<16;b++)
Mark Harrisd6d70372017-02-20 19:51:40 -0800791 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500792 midE[i] = sum;
793 }
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800794
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800795 frame_stationarity /= NB_TBANDS;
796 relativeE /= NB_TBANDS;
797 if (tonal->count<10)
Jean-Marc Valin5bb3cbc2017-06-19 17:55:56 -0400798 relativeE = .5f;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800799 frame_noisiness /= NB_TBANDS;
800#if 1
801 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
802#else
803 info->activity = .5*(1+frame_noisiness-frame_stationarity);
804#endif
Jean-Marc Valin0892c162012-01-12 03:44:49 -0500805 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
Jean-Marc Valind683c762012-12-21 16:17:38 -0500806 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500807 tonal->prev_tonality = frame_tonality;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500808
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500809 slope /= 8*8;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800810 info->tonality_slope = slope;
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500811
812 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500813 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800814 info->tonality = frame_tonality;
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800815
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500816 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500817 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500818
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500819 for (i=0;i<4;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500820 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
821
822 for (i=0;i<4;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500823 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500824 for (i=0;i<3;i++)
Jean-Marc Valind683c762012-12-21 16:17:38 -0500825 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
Jean-Marc Valincd213ea2011-11-21 21:57:10 -0500826
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500827 if (tonal->count > 5)
828 {
829 for (i=0;i<9;i++)
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500830 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500831 }
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500832 for (i=0;i<4;i++)
833 features[i] = BFCC[i]-midE[i];
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500834
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800835 for (i=0;i<8;i++)
836 {
837 tonal->mem[i+24] = tonal->mem[i+16];
838 tonal->mem[i+16] = tonal->mem[i+8];
839 tonal->mem[i+8] = tonal->mem[i];
840 tonal->mem[i] = BFCC[i];
841 }
Jean-Marc Valin747c8172011-11-22 22:44:56 -0500842 for (i=0;i<9;i++)
Jean-Marc Valind68e8092016-01-07 15:49:36 -0500843 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
Mark Harrisd6d70372017-02-20 19:51:40 -0800844 features[18] = spec_variability - 0.78f;
845 features[20] = info->tonality - 0.154723f;
846 features[21] = info->activity - 0.724643f;
847 features[22] = frame_stationarity - 0.743717f;
848 features[23] = info->tonality_slope + 0.069216f;
849 features[24] = tonal->lowECount - 0.067930f;
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500850
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400851 compute_dense(&layer0, layer_out, features);
852 compute_gru(&layer1, tonal->rnn_state, layer_out);
853 compute_dense(&layer2, frame_probs, tonal->rnn_state);
Jean-Marc Valin7609b672011-11-23 13:52:44 -0500854
Felicia Lim36481342016-05-16 15:29:53 +0200855 /* Probability of speech or music vs noise */
856 info->activity_probability = frame_probs[1];
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400857 /* It seems like the RNN tends to have a bias towards speech and this
858 warping of the probabilities compensates for it. */
859 info->music_prob = frame_probs[0] * (2 - frame_probs[0]);
Felicia Lim36481342016-05-16 15:29:53 +0200860
Jean-Marc Valinaf93fbd2017-07-12 16:55:28 -0400861 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500862#ifdef MLP_TRAINING
863 for (i=0;i<25;i++)
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800864 printf("%f ", features[i]);
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500865 printf("\n");
866#endif
Jean-Marc Valin73eb3632011-11-16 10:47:15 +0800867
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500868 info->bandwidth = bandwidth;
Jean-Marc Valindbe22f12017-06-26 13:02:23 -0400869 tonal->prev_bandwidth = bandwidth;
Jean-Marc Valin7509fdb2012-12-20 22:48:35 -0500870 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
Jean-Marc Valin2a9fdbc2011-12-07 17:39:40 -0500871 info->noisiness = frame_noisiness;
Jean-Marc Valine9c353a2011-11-14 17:58:29 +0800872 info->valid = 1;
Jean-Marc Valina8783a12013-04-15 15:49:40 -0400873 RESTORE_STACK;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500874}
875
Jean-Marc Valina4c25122013-09-28 17:22:41 -0400876void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
Jean-Marc Valinb90e63b2013-09-16 13:08:52 -0400877 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
Ralph Gilesd43445f2015-12-30 10:00:17 -0800878 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500879{
880 int offset;
881 int pcm_len;
882
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500883 analysis_frame_size -= analysis_frame_size&1;
Jean-Marc Valin29254442013-09-28 19:29:23 -0400884 if (analysis_pcm != NULL)
885 {
886 /* Avoid overflow/wrap-around of the analysis buffer */
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500887 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500888
Jean-Marc Valin29254442013-09-28 19:29:23 -0400889 pcm_len = analysis_frame_size - analysis->analysis_offset;
890 offset = analysis->analysis_offset;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400891 while (pcm_len>0) {
Jean-Marc Valincf9409f2016-12-16 17:52:15 -0500892 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
893 offset += Fs/50;
894 pcm_len -= Fs/50;
Jean-Marc Valindbff5fc2016-09-07 11:20:06 -0400895 }
Jean-Marc Valin29254442013-09-28 19:29:23 -0400896 analysis->analysis_offset = analysis_frame_size;
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500897
Jean-Marc Valin29254442013-09-28 19:29:23 -0400898 analysis->analysis_offset -= frame_size;
899 }
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500900
Jean-Marc Valin51f4a322013-02-20 04:08:04 -0500901 analysis_info->valid = 0;
902 tonality_get_info(analysis, analysis_info, frame_size);
Jean-Marc Valin1a2e7652011-11-06 23:27:16 -0500903}
Mark Harrisd6d70372017-02-20 19:51:40 -0800904
905#endif /* DISABLE_FLOAT_API */