andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
pbos@webrtc.org | abf0cd8 | 2013-05-27 09:49:58 +0000 | [diff] [blame] | 11 | #include "webrtc/common_audio/vad/vad_core.h" |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 12 | |
pbos@webrtc.org | abf0cd8 | 2013-05-27 09:49:58 +0000 | [diff] [blame] | 13 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" |
| 14 | #include "webrtc/common_audio/vad/vad_filterbank.h" |
| 15 | #include "webrtc/common_audio/vad/vad_gmm.h" |
| 16 | #include "webrtc/common_audio/vad/vad_sp.h" |
| 17 | #include "webrtc/typedefs.h" |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 18 | |
| 19 | // Spectrum Weighting |
| 20 | static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 }; |
| 21 | static const int16_t kNoiseUpdateConst = 655; // Q15 |
| 22 | static const int16_t kSpeechUpdateConst = 6554; // Q15 |
| 23 | static const int16_t kBackEta = 154; // Q8 |
| 24 | // Minimum difference between the two models, Q5 |
| 25 | static const int16_t kMinimumDifference[kNumChannels] = { |
| 26 | 544, 544, 576, 576, 576, 576 }; |
| 27 | // Upper limit of mean value for speech model, Q7 |
| 28 | static const int16_t kMaximumSpeech[kNumChannels] = { |
| 29 | 11392, 11392, 11520, 11520, 11520, 11520 }; |
| 30 | // Minimum value for mean value |
| 31 | static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 }; |
| 32 | // Upper limit of mean value for noise model, Q7 |
| 33 | static const int16_t kMaximumNoise[kNumChannels] = { |
| 34 | 9216, 9088, 8960, 8832, 8704, 8576 }; |
| 35 | // Start values for the Gaussian models, Q7 |
| 36 | // Weights for the two Gaussians for the six channels (noise) |
| 37 | static const int16_t kNoiseDataWeights[kTableSize] = { |
| 38 | 34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 }; |
| 39 | // Weights for the two Gaussians for the six channels (speech) |
| 40 | static const int16_t kSpeechDataWeights[kTableSize] = { |
| 41 | 48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 }; |
| 42 | // Means for the two Gaussians for the six channels (noise) |
| 43 | static const int16_t kNoiseDataMeans[kTableSize] = { |
| 44 | 6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 }; |
| 45 | // Means for the two Gaussians for the six channels (speech) |
| 46 | static const int16_t kSpeechDataMeans[kTableSize] = { |
| 47 | 8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483 |
| 48 | }; |
| 49 | // Stds for the two Gaussians for the six channels (noise) |
| 50 | static const int16_t kNoiseDataStds[kTableSize] = { |
| 51 | 378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 }; |
| 52 | // Stds for the two Gaussians for the six channels (speech) |
| 53 | static const int16_t kSpeechDataStds[kTableSize] = { |
| 54 | 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 }; |
| 55 | |
| 56 | // Constants used in GmmProbability(). |
| 57 | // |
| 58 | // Maximum number of counted speech (VAD = 1) frames in a row. |
| 59 | static const int16_t kMaxSpeechFrames = 6; |
| 60 | // Minimum standard deviation for both speech and noise. |
| 61 | static const int16_t kMinStd = 384; |
| 62 | |
| 63 | // Constants in WebRtcVad_InitCore(). |
| 64 | // Default aggressiveness mode. |
| 65 | static const short kDefaultMode = 0; |
| 66 | static const int kInitCheck = 42; |
| 67 | |
| 68 | // Constants used in WebRtcVad_set_mode_core(). |
| 69 | // |
| 70 | // Thresholds for different frame lengths (10 ms, 20 ms and 30 ms). |
| 71 | // |
| 72 | // Mode 0, Quality. |
| 73 | static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 }; |
| 74 | static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 }; |
| 75 | static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 }; |
| 76 | static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 }; |
| 77 | // Mode 1, Low bitrate. |
| 78 | static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 }; |
| 79 | static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 }; |
| 80 | static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 }; |
| 81 | static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 }; |
| 82 | // Mode 2, Aggressive. |
| 83 | static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 }; |
| 84 | static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 }; |
| 85 | static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 }; |
| 86 | static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 }; |
| 87 | // Mode 3, Very aggressive. |
| 88 | static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 }; |
| 89 | static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 }; |
| 90 | static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 }; |
| 91 | static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 }; |
| 92 | |
| 93 | // Calculates the weighted average w.r.t. number of Gaussians. The |data| are |
| 94 | // updated with an |offset| before averaging. |
| 95 | // |
| 96 | // - data [i/o] : Data to average. |
| 97 | // - offset [i] : An offset added to |data|. |
| 98 | // - weights [i] : Weights used for averaging. |
| 99 | // |
| 100 | // returns : The weighted average. |
| 101 | static int32_t WeightedAverage(int16_t* data, int16_t offset, |
| 102 | const int16_t* weights) { |
| 103 | int k; |
| 104 | int32_t weighted_average = 0; |
| 105 | |
| 106 | for (k = 0; k < kNumGaussians; k++) { |
| 107 | data[k * kNumChannels] += offset; |
| 108 | weighted_average += data[k * kNumChannels] * weights[k * kNumChannels]; |
| 109 | } |
| 110 | return weighted_average; |
| 111 | } |
| 112 | |
| 113 | // Calculates the probabilities for both speech and background noise using |
| 114 | // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which |
| 115 | // type of signal is most probable. |
| 116 | // |
| 117 | // - self [i/o] : Pointer to VAD instance |
| 118 | // - features [i] : Feature vector of length |kNumChannels| |
| 119 | // = log10(energy in frequency band) |
| 120 | // - total_power [i] : Total power in audio frame. |
| 121 | // - frame_length [i] : Number of input samples |
| 122 | // |
| 123 | // - returns : the VAD decision (0 - noise, 1 - speech). |
| 124 | static int16_t GmmProbability(VadInstT* self, int16_t* features, |
| 125 | int16_t total_power, int frame_length) { |
| 126 | int channel, k; |
| 127 | int16_t feature_minimum; |
| 128 | int16_t h0, h1; |
| 129 | int16_t log_likelihood_ratio; |
| 130 | int16_t vadflag = 0; |
| 131 | int16_t shifts_h0, shifts_h1; |
| 132 | int16_t tmp_s16, tmp1_s16, tmp2_s16; |
| 133 | int16_t diff; |
| 134 | int gaussian; |
| 135 | int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk; |
| 136 | int16_t delt, ndelt; |
| 137 | int16_t maxspe, maxmu; |
| 138 | int16_t deltaN[kTableSize], deltaS[kTableSize]; |
| 139 | int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0. |
| 140 | int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0. |
| 141 | int32_t h0_test, h1_test; |
| 142 | int32_t tmp1_s32, tmp2_s32; |
| 143 | int32_t sum_log_likelihood_ratios = 0; |
| 144 | int32_t noise_global_mean, speech_global_mean; |
| 145 | int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians]; |
| 146 | int16_t overhead1, overhead2, individualTest, totalTest; |
| 147 | |
| 148 | // Set various thresholds based on frame lengths (80, 160 or 240 samples). |
| 149 | if (frame_length == 80) { |
| 150 | overhead1 = self->over_hang_max_1[0]; |
| 151 | overhead2 = self->over_hang_max_2[0]; |
| 152 | individualTest = self->individual[0]; |
| 153 | totalTest = self->total[0]; |
| 154 | } else if (frame_length == 160) { |
| 155 | overhead1 = self->over_hang_max_1[1]; |
| 156 | overhead2 = self->over_hang_max_2[1]; |
| 157 | individualTest = self->individual[1]; |
| 158 | totalTest = self->total[1]; |
| 159 | } else { |
| 160 | overhead1 = self->over_hang_max_1[2]; |
| 161 | overhead2 = self->over_hang_max_2[2]; |
| 162 | individualTest = self->individual[2]; |
| 163 | totalTest = self->total[2]; |
| 164 | } |
| 165 | |
| 166 | if (total_power > kMinEnergy) { |
| 167 | // The signal power of current frame is large enough for processing. The |
| 168 | // processing consists of two parts: |
| 169 | // 1) Calculating the likelihood of speech and thereby a VAD decision. |
| 170 | // 2) Updating the underlying model, w.r.t., the decision made. |
| 171 | |
| 172 | // The detection scheme is an LRT with hypothesis |
| 173 | // H0: Noise |
| 174 | // H1: Speech |
| 175 | // |
| 176 | // We combine a global LRT with local tests, for each frequency sub-band, |
| 177 | // here defined as |channel|. |
| 178 | for (channel = 0; channel < kNumChannels; channel++) { |
| 179 | // For each channel we model the probability with a GMM consisting of |
| 180 | // |kNumGaussians|, with different means and standard deviations depending |
| 181 | // on H0 or H1. |
| 182 | h0_test = 0; |
| 183 | h1_test = 0; |
| 184 | for (k = 0; k < kNumGaussians; k++) { |
| 185 | gaussian = channel + k * kNumChannels; |
| 186 | // Probability under H0, that is, probability of frame being noise. |
| 187 | // Value given in Q27 = Q7 * Q20. |
| 188 | tmp1_s32 = WebRtcVad_GaussianProbability(features[channel], |
| 189 | self->noise_means[gaussian], |
| 190 | self->noise_stds[gaussian], |
| 191 | &deltaN[gaussian]); |
| 192 | noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32; |
| 193 | h0_test += noise_probability[k]; // Q27 |
| 194 | |
| 195 | // Probability under H1, that is, probability of frame being speech. |
| 196 | // Value given in Q27 = Q7 * Q20. |
| 197 | tmp1_s32 = WebRtcVad_GaussianProbability(features[channel], |
| 198 | self->speech_means[gaussian], |
| 199 | self->speech_stds[gaussian], |
| 200 | &deltaS[gaussian]); |
| 201 | speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32; |
| 202 | h1_test += speech_probability[k]; // Q27 |
| 203 | } |
| 204 | |
| 205 | // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}). |
| 206 | // Approximation: |
| 207 | // log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q) |
| 208 | // = log2(h1_test) - log2(h0_test) |
| 209 | // = log2(2^(31-shifts_h1)*(1+b1)) |
| 210 | // - log2(2^(31-shifts_h0)*(1+b0)) |
| 211 | // = shifts_h0 - shifts_h1 |
| 212 | // + log2(1+b1) - log2(1+b0) |
| 213 | // ~= shifts_h0 - shifts_h1 |
| 214 | // |
| 215 | // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1. |
| 216 | // Further, b0 and b1 are independent and on the average the two terms |
| 217 | // cancel. |
| 218 | shifts_h0 = WebRtcSpl_NormW32(h0_test); |
| 219 | shifts_h1 = WebRtcSpl_NormW32(h1_test); |
| 220 | if (h0_test == 0) { |
| 221 | shifts_h0 = 31; |
| 222 | } |
| 223 | if (h1_test == 0) { |
| 224 | shifts_h1 = 31; |
| 225 | } |
| 226 | log_likelihood_ratio = shifts_h0 - shifts_h1; |
| 227 | |
| 228 | // Update |sum_log_likelihood_ratios| with spectrum weighting. This is |
| 229 | // used for the global VAD decision. |
| 230 | sum_log_likelihood_ratios += |
| 231 | (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]); |
| 232 | |
| 233 | // Local VAD decision. |
| 234 | if ((log_likelihood_ratio << 2) > individualTest) { |
| 235 | vadflag = 1; |
| 236 | } |
| 237 | |
| 238 | // TODO(bjornv): The conditional probabilities below are applied on the |
| 239 | // hard coded number of Gaussians set to two. Find a way to generalize. |
| 240 | // Calculate local noise probabilities used later when updating the GMM. |
| 241 | h0 = (int16_t) (h0_test >> 12); // Q15 |
| 242 | if (h0 > 0) { |
| 243 | // High probability of noise. Assign conditional probabilities for each |
| 244 | // Gaussian in the GMM. |
| 245 | tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29 |
| 246 | ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14 |
| 247 | ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel]; |
| 248 | } else { |
| 249 | // Low noise probability. Assign conditional probability 1 to the first |
| 250 | // Gaussian and 0 to the rest (which is already set at initialization). |
| 251 | ngprvec[channel] = 16384; |
| 252 | } |
| 253 | |
| 254 | // Calculate local speech probabilities used later when updating the GMM. |
| 255 | h1 = (int16_t) (h1_test >> 12); // Q15 |
| 256 | if (h1 > 0) { |
| 257 | // High probability of speech. Assign conditional probabilities for each |
| 258 | // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0. |
| 259 | tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29 |
| 260 | sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14 |
| 261 | sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel]; |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | // Make a global VAD decision. |
| 266 | vadflag |= (sum_log_likelihood_ratios >= totalTest); |
| 267 | |
| 268 | // Update the model parameters. |
| 269 | maxspe = 12800; |
| 270 | for (channel = 0; channel < kNumChannels; channel++) { |
| 271 | |
| 272 | // Get minimum value in past which is used for long term correction in Q4. |
| 273 | feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel); |
| 274 | |
| 275 | // Compute the "global" mean, that is the sum of the two means weighted. |
| 276 | noise_global_mean = WeightedAverage(&self->noise_means[channel], 0, |
| 277 | &kNoiseDataWeights[channel]); |
| 278 | tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8 |
| 279 | |
| 280 | for (k = 0; k < kNumGaussians; k++) { |
| 281 | gaussian = channel + k * kNumChannels; |
| 282 | |
| 283 | nmk = self->noise_means[gaussian]; |
| 284 | smk = self->speech_means[gaussian]; |
| 285 | nsk = self->noise_stds[gaussian]; |
| 286 | ssk = self->speech_stds[gaussian]; |
| 287 | |
| 288 | // Update noise mean vector if the frame consists of noise only. |
| 289 | nmk2 = nmk; |
| 290 | if (!vadflag) { |
| 291 | // deltaN = (x-mu)/sigma^2 |
| 292 | // ngprvec[k] = |noise_probability[k]| / |
| 293 | // (|noise_probability[0]| + |noise_probability[1]|) |
| 294 | |
| 295 | // (Q14 * Q11 >> 11) = Q14. |
| 296 | delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[gaussian], |
| 297 | deltaN[gaussian], |
| 298 | 11); |
| 299 | // Q7 + (Q14 * Q15 >> 22) = Q7. |
| 300 | nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt, |
| 301 | kNoiseUpdateConst, |
| 302 | 22); |
| 303 | } |
| 304 | |
| 305 | // Long term correction of the noise mean. |
| 306 | // Q8 - Q8 = Q8. |
| 307 | ndelt = (feature_minimum << 4) - tmp1_s16; |
| 308 | // Q7 + (Q8 * Q8) >> 9 = Q7. |
| 309 | nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9); |
| 310 | |
| 311 | // Control that the noise mean does not drift to much. |
| 312 | tmp_s16 = (int16_t) ((k + 5) << 7); |
| 313 | if (nmk3 < tmp_s16) { |
| 314 | nmk3 = tmp_s16; |
| 315 | } |
| 316 | tmp_s16 = (int16_t) ((72 + k - channel) << 7); |
| 317 | if (nmk3 > tmp_s16) { |
| 318 | nmk3 = tmp_s16; |
| 319 | } |
| 320 | self->noise_means[gaussian] = nmk3; |
| 321 | |
| 322 | if (vadflag) { |
| 323 | // Update speech mean vector: |
| 324 | // |deltaS| = (x-mu)/sigma^2 |
| 325 | // sgprvec[k] = |speech_probability[k]| / |
| 326 | // (|speech_probability[0]| + |speech_probability[1]|) |
| 327 | |
| 328 | // (Q14 * Q11) >> 11 = Q14. |
| 329 | delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[gaussian], |
| 330 | deltaS[gaussian], |
| 331 | 11); |
| 332 | // Q14 * Q15 >> 21 = Q8. |
| 333 | tmp_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt, |
| 334 | kSpeechUpdateConst, |
| 335 | 21); |
| 336 | // Q7 + (Q8 >> 1) = Q7. With rounding. |
| 337 | smk2 = smk + ((tmp_s16 + 1) >> 1); |
| 338 | |
| 339 | // Control that the speech mean does not drift to much. |
| 340 | maxmu = maxspe + 640; |
| 341 | if (smk2 < kMinimumMean[k]) { |
| 342 | smk2 = kMinimumMean[k]; |
| 343 | } |
| 344 | if (smk2 > maxmu) { |
| 345 | smk2 = maxmu; |
| 346 | } |
| 347 | self->speech_means[gaussian] = smk2; // Q7. |
| 348 | |
| 349 | // (Q7 >> 3) = Q4. With rounding. |
| 350 | tmp_s16 = ((smk + 4) >> 3); |
| 351 | |
| 352 | tmp_s16 = features[channel] - tmp_s16; // Q4 |
| 353 | // (Q11 * Q4 >> 3) = Q12. |
| 354 | tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[gaussian], tmp_s16, 3); |
| 355 | tmp2_s32 = tmp1_s32 - 4096; |
| 356 | tmp_s16 = sgprvec[gaussian] >> 2; |
| 357 | // (Q14 >> 2) * Q12 = Q24. |
| 358 | tmp1_s32 = tmp_s16 * tmp2_s32; |
| 359 | |
| 360 | tmp2_s32 = tmp1_s32 >> 4; // Q20 |
| 361 | |
| 362 | // 0.1 * Q20 / Q7 = Q13. |
| 363 | if (tmp2_s32 > 0) { |
| 364 | tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10); |
| 365 | } else { |
| 366 | tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10); |
| 367 | tmp_s16 = -tmp_s16; |
| 368 | } |
| 369 | // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4). |
| 370 | // Note that division by 4 equals shift by 2, hence, |
| 371 | // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7. |
| 372 | tmp_s16 += 128; // Rounding. |
| 373 | ssk += (tmp_s16 >> 8); |
| 374 | if (ssk < kMinStd) { |
| 375 | ssk = kMinStd; |
| 376 | } |
| 377 | self->speech_stds[gaussian] = ssk; |
| 378 | } else { |
| 379 | // Update GMM variance vectors. |
| 380 | // deltaN * (features[channel] - nmk) - 1 |
| 381 | // Q4 - (Q7 >> 3) = Q4. |
| 382 | tmp_s16 = features[channel] - (nmk >> 3); |
| 383 | // (Q11 * Q4 >> 3) = Q12. |
| 384 | tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[gaussian], tmp_s16, 3); |
| 385 | tmp1_s32 -= 4096; |
| 386 | |
| 387 | // (Q14 >> 2) * Q12 = Q24. |
| 388 | tmp_s16 = (ngprvec[gaussian] + 2) >> 2; |
| 389 | tmp2_s32 = tmp_s16 * tmp1_s32; |
| 390 | // Q20 * approx 0.001 (2^-10=0.0009766), hence, |
| 391 | // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20. |
| 392 | tmp1_s32 = tmp2_s32 >> 14; |
| 393 | |
| 394 | // Q20 / Q7 = Q13. |
| 395 | if (tmp1_s32 > 0) { |
| 396 | tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk); |
| 397 | } else { |
| 398 | tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk); |
| 399 | tmp_s16 = -tmp_s16; |
| 400 | } |
| 401 | tmp_s16 += 32; // Rounding |
| 402 | nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7. |
| 403 | if (nsk < kMinStd) { |
| 404 | nsk = kMinStd; |
| 405 | } |
| 406 | self->noise_stds[gaussian] = nsk; |
| 407 | } |
| 408 | } |
| 409 | |
| 410 | // Separate models if they are too close. |
| 411 | // |noise_global_mean| in Q14 (= Q7 * Q7). |
| 412 | noise_global_mean = WeightedAverage(&self->noise_means[channel], 0, |
| 413 | &kNoiseDataWeights[channel]); |
| 414 | |
| 415 | // |speech_global_mean| in Q14 (= Q7 * Q7). |
| 416 | speech_global_mean = WeightedAverage(&self->speech_means[channel], 0, |
| 417 | &kSpeechDataWeights[channel]); |
| 418 | |
| 419 | // |diff| = "global" speech mean - "global" noise mean. |
| 420 | // (Q14 >> 9) - (Q14 >> 9) = Q5. |
| 421 | diff = (int16_t) (speech_global_mean >> 9) - |
| 422 | (int16_t) (noise_global_mean >> 9); |
| 423 | if (diff < kMinimumDifference[channel]) { |
| 424 | tmp_s16 = kMinimumDifference[channel] - diff; |
| 425 | |
| 426 | // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7. |
| 427 | // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7. |
| 428 | tmp1_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp_s16, 2); |
| 429 | tmp2_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp_s16, 2); |
| 430 | |
| 431 | // Move Gaussian means for speech model by |tmp1_s16| and update |
| 432 | // |speech_global_mean|. Note that |self->speech_means[channel]| is |
| 433 | // changed after the call. |
| 434 | speech_global_mean = WeightedAverage(&self->speech_means[channel], |
| 435 | tmp1_s16, |
| 436 | &kSpeechDataWeights[channel]); |
| 437 | |
| 438 | // Move Gaussian means for noise model by -|tmp2_s16| and update |
| 439 | // |noise_global_mean|. Note that |self->noise_means[channel]| is |
| 440 | // changed after the call. |
| 441 | noise_global_mean = WeightedAverage(&self->noise_means[channel], |
| 442 | -tmp2_s16, |
| 443 | &kNoiseDataWeights[channel]); |
| 444 | } |
| 445 | |
| 446 | // Control that the speech & noise means do not drift to much. |
| 447 | maxspe = kMaximumSpeech[channel]; |
| 448 | tmp2_s16 = (int16_t) (speech_global_mean >> 7); |
| 449 | if (tmp2_s16 > maxspe) { |
| 450 | // Upper limit of speech model. |
| 451 | tmp2_s16 -= maxspe; |
| 452 | |
| 453 | for (k = 0; k < kNumGaussians; k++) { |
| 454 | self->speech_means[channel + k * kNumChannels] -= tmp2_s16; |
| 455 | } |
| 456 | } |
| 457 | |
| 458 | tmp2_s16 = (int16_t) (noise_global_mean >> 7); |
| 459 | if (tmp2_s16 > kMaximumNoise[channel]) { |
| 460 | tmp2_s16 -= kMaximumNoise[channel]; |
| 461 | |
| 462 | for (k = 0; k < kNumGaussians; k++) { |
| 463 | self->noise_means[channel + k * kNumChannels] -= tmp2_s16; |
| 464 | } |
| 465 | } |
| 466 | } |
| 467 | self->frame_counter++; |
| 468 | } |
| 469 | |
| 470 | // Smooth with respect to transition hysteresis. |
| 471 | if (!vadflag) { |
| 472 | if (self->over_hang > 0) { |
| 473 | vadflag = 2 + self->over_hang; |
| 474 | self->over_hang--; |
| 475 | } |
| 476 | self->num_of_speech = 0; |
| 477 | } else { |
| 478 | self->num_of_speech++; |
| 479 | if (self->num_of_speech > kMaxSpeechFrames) { |
| 480 | self->num_of_speech = kMaxSpeechFrames; |
| 481 | self->over_hang = overhead2; |
| 482 | } else { |
| 483 | self->over_hang = overhead1; |
| 484 | } |
| 485 | } |
| 486 | return vadflag; |
| 487 | } |
| 488 | |
| 489 | // Initialize the VAD. Set aggressiveness mode to default value. |
| 490 | int WebRtcVad_InitCore(VadInstT* self) { |
| 491 | int i; |
| 492 | |
| 493 | if (self == NULL) { |
| 494 | return -1; |
| 495 | } |
| 496 | |
| 497 | // Initialization of general struct variables. |
| 498 | self->vad = 1; // Speech active (=1). |
| 499 | self->frame_counter = 0; |
| 500 | self->over_hang = 0; |
| 501 | self->num_of_speech = 0; |
| 502 | |
| 503 | // Initialization of downsampling filter state. |
| 504 | memset(self->downsampling_filter_states, 0, |
| 505 | sizeof(self->downsampling_filter_states)); |
| 506 | |
| 507 | // Initialization of 48 to 8 kHz downsampling. |
| 508 | WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8); |
| 509 | |
| 510 | // Read initial PDF parameters. |
| 511 | for (i = 0; i < kTableSize; i++) { |
| 512 | self->noise_means[i] = kNoiseDataMeans[i]; |
| 513 | self->speech_means[i] = kSpeechDataMeans[i]; |
| 514 | self->noise_stds[i] = kNoiseDataStds[i]; |
| 515 | self->speech_stds[i] = kSpeechDataStds[i]; |
| 516 | } |
| 517 | |
| 518 | // Initialize Index and Minimum value vectors. |
| 519 | for (i = 0; i < 16 * kNumChannels; i++) { |
| 520 | self->low_value_vector[i] = 10000; |
| 521 | self->index_vector[i] = 0; |
| 522 | } |
| 523 | |
| 524 | // Initialize splitting filter states. |
| 525 | memset(self->upper_state, 0, sizeof(self->upper_state)); |
| 526 | memset(self->lower_state, 0, sizeof(self->lower_state)); |
| 527 | |
| 528 | // Initialize high pass filter states. |
| 529 | memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state)); |
| 530 | |
| 531 | // Initialize mean value memory, for WebRtcVad_FindMinimum(). |
| 532 | for (i = 0; i < kNumChannels; i++) { |
| 533 | self->mean_value[i] = 1600; |
| 534 | } |
| 535 | |
| 536 | // Set aggressiveness mode to default (=|kDefaultMode|). |
| 537 | if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) { |
| 538 | return -1; |
| 539 | } |
| 540 | |
| 541 | self->init_flag = kInitCheck; |
| 542 | |
| 543 | return 0; |
| 544 | } |
| 545 | |
| 546 | // Set aggressiveness mode |
| 547 | int WebRtcVad_set_mode_core(VadInstT* self, int mode) { |
| 548 | int return_value = 0; |
| 549 | |
| 550 | switch (mode) { |
| 551 | case 0: |
| 552 | // Quality mode. |
| 553 | memcpy(self->over_hang_max_1, kOverHangMax1Q, |
| 554 | sizeof(self->over_hang_max_1)); |
| 555 | memcpy(self->over_hang_max_2, kOverHangMax2Q, |
| 556 | sizeof(self->over_hang_max_2)); |
| 557 | memcpy(self->individual, kLocalThresholdQ, |
| 558 | sizeof(self->individual)); |
| 559 | memcpy(self->total, kGlobalThresholdQ, |
| 560 | sizeof(self->total)); |
| 561 | break; |
| 562 | case 1: |
| 563 | // Low bitrate mode. |
| 564 | memcpy(self->over_hang_max_1, kOverHangMax1LBR, |
| 565 | sizeof(self->over_hang_max_1)); |
| 566 | memcpy(self->over_hang_max_2, kOverHangMax2LBR, |
| 567 | sizeof(self->over_hang_max_2)); |
| 568 | memcpy(self->individual, kLocalThresholdLBR, |
| 569 | sizeof(self->individual)); |
| 570 | memcpy(self->total, kGlobalThresholdLBR, |
| 571 | sizeof(self->total)); |
| 572 | break; |
| 573 | case 2: |
| 574 | // Aggressive mode. |
| 575 | memcpy(self->over_hang_max_1, kOverHangMax1AGG, |
| 576 | sizeof(self->over_hang_max_1)); |
| 577 | memcpy(self->over_hang_max_2, kOverHangMax2AGG, |
| 578 | sizeof(self->over_hang_max_2)); |
| 579 | memcpy(self->individual, kLocalThresholdAGG, |
| 580 | sizeof(self->individual)); |
| 581 | memcpy(self->total, kGlobalThresholdAGG, |
| 582 | sizeof(self->total)); |
| 583 | break; |
| 584 | case 3: |
| 585 | // Very aggressive mode. |
| 586 | memcpy(self->over_hang_max_1, kOverHangMax1VAG, |
| 587 | sizeof(self->over_hang_max_1)); |
| 588 | memcpy(self->over_hang_max_2, kOverHangMax2VAG, |
| 589 | sizeof(self->over_hang_max_2)); |
| 590 | memcpy(self->individual, kLocalThresholdVAG, |
| 591 | sizeof(self->individual)); |
| 592 | memcpy(self->total, kGlobalThresholdVAG, |
| 593 | sizeof(self->total)); |
| 594 | break; |
| 595 | default: |
| 596 | return_value = -1; |
| 597 | break; |
| 598 | } |
| 599 | |
| 600 | return return_value; |
| 601 | } |
| 602 | |
| 603 | // Calculate VAD decision by first extracting feature values and then calculate |
| 604 | // probability for both speech and background noise. |
| 605 | |
andrew@webrtc.org | c2e6438 | 2014-04-30 16:44:13 +0000 | [diff] [blame] | 606 | int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 607 | int frame_length) { |
| 608 | int vad; |
| 609 | int i; |
| 610 | int16_t speech_nb[240]; // 30 ms in 8 kHz. |
| 611 | // |tmp_mem| is a temporary memory used by resample function, length is |
| 612 | // frame length in 10 ms (480 samples) + 256 extra. |
| 613 | int32_t tmp_mem[480 + 256] = { 0 }; |
| 614 | const int kFrameLen10ms48khz = 480; |
| 615 | const int kFrameLen10ms8khz = 80; |
| 616 | int num_10ms_frames = frame_length / kFrameLen10ms48khz; |
| 617 | |
| 618 | for (i = 0; i < num_10ms_frames; i++) { |
| 619 | WebRtcSpl_Resample48khzTo8khz(speech_frame, |
| 620 | &speech_nb[i * kFrameLen10ms8khz], |
| 621 | &inst->state_48_to_8, |
| 622 | tmp_mem); |
| 623 | } |
| 624 | |
| 625 | // Do VAD on an 8 kHz signal |
| 626 | vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); |
| 627 | |
| 628 | return vad; |
| 629 | } |
| 630 | |
andrew@webrtc.org | c2e6438 | 2014-04-30 16:44:13 +0000 | [diff] [blame] | 631 | int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 632 | int frame_length) |
| 633 | { |
| 634 | int len, vad; |
| 635 | int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) |
| 636 | int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
| 637 | |
| 638 | |
| 639 | // Downsample signal 32->16->8 before doing VAD |
| 640 | WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), |
| 641 | frame_length); |
| 642 | len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); |
| 643 | |
| 644 | WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); |
| 645 | len = WEBRTC_SPL_RSHIFT_W16(len, 1); |
| 646 | |
| 647 | // Do VAD on an 8 kHz signal |
| 648 | vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| 649 | |
| 650 | return vad; |
| 651 | } |
| 652 | |
andrew@webrtc.org | c2e6438 | 2014-04-30 16:44:13 +0000 | [diff] [blame] | 653 | int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 654 | int frame_length) |
| 655 | { |
| 656 | int len, vad; |
| 657 | int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) |
| 658 | |
| 659 | // Wideband: Downsample signal before doing VAD |
| 660 | WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, |
| 661 | frame_length); |
| 662 | |
| 663 | len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); |
| 664 | vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); |
| 665 | |
| 666 | return vad; |
| 667 | } |
| 668 | |
andrew@webrtc.org | c2e6438 | 2014-04-30 16:44:13 +0000 | [diff] [blame] | 669 | int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, |
andrew@webrtc.org | a7b57da | 2012-10-22 18:19:23 +0000 | [diff] [blame] | 670 | int frame_length) |
| 671 | { |
| 672 | int16_t feature_vector[kNumChannels], total_power; |
| 673 | |
| 674 | // Get power in the bands |
| 675 | total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, |
| 676 | feature_vector); |
| 677 | |
| 678 | // Make a VAD |
| 679 | inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); |
| 680 | |
| 681 | return inst->vad; |
| 682 | } |