blob: f9470da225c2ebc99c0bfedd5df7edc521817f4e [file] [log] [blame]
Gregory Maxwellae231142011-07-30 08:18:48 -04001/***********************************************************************
2Copyright (c) 2006-2011, Skype Limited. All rights reserved.
3Redistribution and use in source and binary forms, with or without
4modification, (subject to the limitations in the disclaimer below)
5are permitted provided that the following conditions are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Skype Limited, nor the names of specific
12contributors, may be used to endorse or promote products derived from
13this software without specific prior written permission.
14NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED
15BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
16CONTRIBUTORS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
17BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
18FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
22USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
23ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
Jean-Marc Valin5a484122011-08-15 10:49:53 -040028#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
Gregory Maxwellae231142011-07-30 08:18:48 -040032#include <stdlib.h>
33#include "silk_main.h"
34
35/**********************************/
36/* Initialization of the Silk VAD */
37/**********************************/
38opus_int silk_VAD_Init( /* O Return value, 0 if success */
39 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
40)
41{
42 opus_int b, ret = 0;
43
44 /* reset state memory */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -070045 silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
Gregory Maxwellae231142011-07-30 08:18:48 -040046
47 /* init noise levels */
48 /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
49 for( b = 0; b < VAD_N_BANDS; b++ ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -070050 psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -040051 }
52
53 /* Initialize state */
54 for( b = 0; b < VAD_N_BANDS; b++ ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -070055 psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
56 psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
Gregory Maxwellae231142011-07-30 08:18:48 -040057 }
58 psSilk_VAD->counter = 15;
59
60 /* init smoothed energy-to-noise ratio*/
61 for( b = 0; b < VAD_N_BANDS; b++ ) {
62 psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */
63 }
64
65 return( ret );
66}
67
68/* Weighting factors for tilt measure */
69static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
70
71/***************************************/
72/* Get the speech activity level in Q8 */
73/***************************************/
74opus_int silk_VAD_GetSA_Q8( /* O Return value, 0 if success */
75 silk_encoder_state *psEncC, /* I/O Encoder state */
76 const opus_int16 pIn[] /* I PCM input */
77)
78{
79 opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
80 opus_int decimated_framelength, dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
81 opus_int32 sumSquared, smooth_coef_Q16;
82 opus_int16 HPstateTmp;
83 opus_int16 X[ VAD_N_BANDS ][ MAX_FRAME_LENGTH / 2 ];
84 opus_int32 Xnrg[ VAD_N_BANDS ];
85 opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
86 opus_int32 speech_nrg, x_tmp;
87 opus_int ret = 0;
88 silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
89
90 /* Safety checks */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -070091 silk_assert( VAD_N_BANDS == 4 );
92 silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
93 silk_assert( psEncC->frame_length <= 512 );
94 silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -040095
96 /***********************/
97 /* Filter and Decimate */
98 /***********************/
99 /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
100 silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ], &X[ 0 ][ 0 ], &X[ 3 ][ 0 ], psEncC->frame_length );
101
102 /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700103 silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState1[ 0 ], &X[ 0 ][ 0 ], &X[ 2 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 1 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400104
105 /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700106 silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState2[ 0 ], &X[ 0 ][ 0 ], &X[ 1 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 2 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400107
108 /*********************************************/
109 /* HP filter on lowest band (differentiator) */
110 /*********************************************/
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700111 decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
112 X[ 0 ][ decimated_framelength - 1 ] = silk_RSHIFT( X[ 0 ][ decimated_framelength - 1 ], 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400113 HPstateTmp = X[ 0 ][ decimated_framelength - 1 ];
114 for( i = decimated_framelength - 1; i > 0; i-- ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700115 X[ 0 ][ i - 1 ] = silk_RSHIFT( X[ 0 ][ i - 1 ], 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400116 X[ 0 ][ i ] -= X[ 0 ][ i - 1 ];
117 }
118 X[ 0 ][ 0 ] -= psSilk_VAD->HPstate;
119 psSilk_VAD->HPstate = HPstateTmp;
120
121 /*************************************/
122 /* Calculate the energy in each band */
123 /*************************************/
124 for( b = 0; b < VAD_N_BANDS; b++ ) {
125 /* Find the decimated framelength in the non-uniformly divided bands */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700126 decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400127
128 /* Split length into subframe lengths */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700129 dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400130 dec_subframe_offset = 0;
131
132 /* Compute energy per sub-frame */
133 /* initialize with summed energy of last subframe */
134 Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
135 for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
136 sumSquared = 0;
137 for( i = 0; i < dec_subframe_length; i++ ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700138 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
Gregory Maxwellae231142011-07-30 08:18:48 -0400139 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700140 x_tmp = silk_RSHIFT( X[ b ][ i + dec_subframe_offset ], 3 );
141 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
Gregory Maxwellae231142011-07-30 08:18:48 -0400142
143 /* Safety check */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700144 silk_assert( sumSquared >= 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400145 }
146
147 /* Add/saturate summed energy of current subframe */
148 if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700149 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
Gregory Maxwellae231142011-07-30 08:18:48 -0400150 } else {
151 /* Look-ahead subframe */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700152 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400153 }
154
155 dec_subframe_offset += dec_subframe_length;
156 }
157 psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
158 }
159
160 /********************/
161 /* Noise estimation */
162 /********************/
163 silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
164
165 /***********************************************/
166 /* Signal-plus-noise to noise ratio estimation */
167 /***********************************************/
168 sumSquared = 0;
169 input_tilt = 0;
170 for( b = 0; b < VAD_N_BANDS; b++ ) {
171 speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
172 if( speech_nrg > 0 ) {
173 /* Divide, with sufficient resolution */
174 if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700175 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400176 } else {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700177 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400178 }
179
180 /* Convert to log domain */
181 SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
182
183 /* Sum-of-squares */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700184 sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
Gregory Maxwellae231142011-07-30 08:18:48 -0400185
186 /* Tilt measure */
187 if( speech_nrg < ( 1 << 20 ) ) {
188 /* Scale down SNR value for small subband speech energies */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700189 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400190 }
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700191 input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400192 } else {
193 NrgToNoiseRatio_Q8[ b ] = 256;
194 }
195 }
196
197 /* Mean-of-squares */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700198 sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
Gregory Maxwellae231142011-07-30 08:18:48 -0400199
200 /* Root-mean-square approximation, scale to dBs, and write to output pointer */
201 pSNR_dB_Q7 = ( opus_int16 )( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
202
203 /*********************************/
204 /* Speech Probability Estimation */
205 /*********************************/
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700206 SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400207
208 /**************************/
209 /* Frequency Tilt Measure */
210 /**************************/
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700211 psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400212
213 /**************************************************/
214 /* Scale the sigmoid output based on power levels */
215 /**************************************************/
216 speech_nrg = 0;
217 for( b = 0; b < VAD_N_BANDS; b++ ) {
218 /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700219 speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400220 }
221
222 /* Power scaling */
223 if( speech_nrg <= 0 ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700224 SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400225 } else if( speech_nrg < 32768 ) {
226 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700227 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400228 } else {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700229 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400230 }
231
232 /* square-root */
233 speech_nrg = silk_SQRT_APPROX( speech_nrg );
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700234 SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400235 }
236
237 /* Copy the resulting speech activity in Q8 */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700238 psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
Gregory Maxwellae231142011-07-30 08:18:48 -0400239
240 /***********************************/
241 /* Energy Level and SNR estimation */
242 /***********************************/
243 /* Smoothing coefficient */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700244 smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( SA_Q15, SA_Q15 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400245
246 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
247 smooth_coef_Q16 >>= 1;
248 }
249
250 for( b = 0; b < VAD_N_BANDS; b++ ) {
251 /* compute smoothed energy-to-noise ratio per band */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700252 psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
Gregory Maxwellae231142011-07-30 08:18:48 -0400253 NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
254
255 /* signal to noise ratio in dB per band */
256 SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
257 /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700258 psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
Gregory Maxwellae231142011-07-30 08:18:48 -0400259 }
260
261 return( ret );
262}
263
264/**************************/
265/* Noise level estimation */
266/**************************/
267void silk_VAD_GetNoiseLevels(
268 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
269 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
270)
271{
272 opus_int k;
273 opus_int32 nl, nrg, inv_nrg;
274 opus_int coef, min_coef;
275
276 /* Initially faster smoothing */
277 if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700278 min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400279 } else {
280 min_coef = 0;
281 }
282
283 for( k = 0; k < VAD_N_BANDS; k++ ) {
284 /* Get old noise level estimate for current band */
285 nl = psSilk_VAD->NL[ k ];
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700286 silk_assert( nl >= 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400287
288 /* Add bias */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700289 nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
290 silk_assert( nrg > 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400291
292 /* Invert energies */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700293 inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
294 silk_assert( inv_nrg >= 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400295
296 /* Less update when subband energy is high */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700297 if( nrg > silk_LSHIFT( nl, 3 ) ) {
Gregory Maxwellae231142011-07-30 08:18:48 -0400298 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
299 } else if( nrg < nl ) {
300 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
301 } else {
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700302 coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400303 }
304
305 /* Initially faster smoothing */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700306 coef = silk_max_int( coef, min_coef );
Gregory Maxwellae231142011-07-30 08:18:48 -0400307
308 /* Smooth inverse energies */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700309 psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
310 silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400311
312 /* Compute noise level by inverting again */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700313 nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
314 silk_assert( nl >= 0 );
Gregory Maxwellae231142011-07-30 08:18:48 -0400315
316 /* Limit noise levels (guarantee 7 bits of head room) */
Jean-Marc Valinfb3a4372011-09-16 00:58:26 -0700317 nl = silk_min( nl, 0x00FFFFFF );
Gregory Maxwellae231142011-07-30 08:18:48 -0400318
319 /* Store as part of state */
320 psSilk_VAD->NL[ k ] = nl;
321 }
322
323 /* Increment frame counter */
324 psSilk_VAD->counter++;
325}