blob: 98da6eaf0b7cdcab4e953f29ebe68ce45ac81804 [file] [log] [blame]
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +00001/*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
pbos@webrtc.orgabf0cd82013-05-27 09:49:58 +000011#include "webrtc/common_audio/vad/vad_core.h"
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +000012
pbos@webrtc.orgabf0cd82013-05-27 09:49:58 +000013#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
14#include "webrtc/common_audio/vad/vad_filterbank.h"
15#include "webrtc/common_audio/vad/vad_gmm.h"
16#include "webrtc/common_audio/vad/vad_sp.h"
17#include "webrtc/typedefs.h"
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +000018
19// Spectrum Weighting
20static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
21static const int16_t kNoiseUpdateConst = 655; // Q15
22static const int16_t kSpeechUpdateConst = 6554; // Q15
23static const int16_t kBackEta = 154; // Q8
24// Minimum difference between the two models, Q5
25static const int16_t kMinimumDifference[kNumChannels] = {
26 544, 544, 576, 576, 576, 576 };
27// Upper limit of mean value for speech model, Q7
28static const int16_t kMaximumSpeech[kNumChannels] = {
29 11392, 11392, 11520, 11520, 11520, 11520 };
30// Minimum value for mean value
31static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
32// Upper limit of mean value for noise model, Q7
33static const int16_t kMaximumNoise[kNumChannels] = {
34 9216, 9088, 8960, 8832, 8704, 8576 };
35// Start values for the Gaussian models, Q7
36// Weights for the two Gaussians for the six channels (noise)
37static const int16_t kNoiseDataWeights[kTableSize] = {
38 34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
39// Weights for the two Gaussians for the six channels (speech)
40static const int16_t kSpeechDataWeights[kTableSize] = {
41 48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
42// Means for the two Gaussians for the six channels (noise)
43static const int16_t kNoiseDataMeans[kTableSize] = {
44 6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
45// Means for the two Gaussians for the six channels (speech)
46static const int16_t kSpeechDataMeans[kTableSize] = {
47 8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
48};
49// Stds for the two Gaussians for the six channels (noise)
50static const int16_t kNoiseDataStds[kTableSize] = {
51 378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
52// Stds for the two Gaussians for the six channels (speech)
53static const int16_t kSpeechDataStds[kTableSize] = {
54 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
55
56// Constants used in GmmProbability().
57//
58// Maximum number of counted speech (VAD = 1) frames in a row.
59static const int16_t kMaxSpeechFrames = 6;
60// Minimum standard deviation for both speech and noise.
61static const int16_t kMinStd = 384;
62
63// Constants in WebRtcVad_InitCore().
64// Default aggressiveness mode.
65static const short kDefaultMode = 0;
66static const int kInitCheck = 42;
67
68// Constants used in WebRtcVad_set_mode_core().
69//
70// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
71//
72// Mode 0, Quality.
73static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
74static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
75static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
76static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
77// Mode 1, Low bitrate.
78static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
79static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
80static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
81static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
82// Mode 2, Aggressive.
83static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
84static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
85static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
86static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
87// Mode 3, Very aggressive.
88static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
89static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
90static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
91static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
92
93// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
94// updated with an |offset| before averaging.
95//
96// - data [i/o] : Data to average.
97// - offset [i] : An offset added to |data|.
98// - weights [i] : Weights used for averaging.
99//
100// returns : The weighted average.
101static int32_t WeightedAverage(int16_t* data, int16_t offset,
102 const int16_t* weights) {
103 int k;
104 int32_t weighted_average = 0;
105
106 for (k = 0; k < kNumGaussians; k++) {
107 data[k * kNumChannels] += offset;
108 weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
109 }
110 return weighted_average;
111}
112
113// Calculates the probabilities for both speech and background noise using
114// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
115// type of signal is most probable.
116//
117// - self [i/o] : Pointer to VAD instance
118// - features [i] : Feature vector of length |kNumChannels|
119// = log10(energy in frequency band)
120// - total_power [i] : Total power in audio frame.
121// - frame_length [i] : Number of input samples
122//
123// - returns : the VAD decision (0 - noise, 1 - speech).
124static int16_t GmmProbability(VadInstT* self, int16_t* features,
125 int16_t total_power, int frame_length) {
126 int channel, k;
127 int16_t feature_minimum;
128 int16_t h0, h1;
129 int16_t log_likelihood_ratio;
130 int16_t vadflag = 0;
131 int16_t shifts_h0, shifts_h1;
132 int16_t tmp_s16, tmp1_s16, tmp2_s16;
133 int16_t diff;
134 int gaussian;
135 int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
136 int16_t delt, ndelt;
137 int16_t maxspe, maxmu;
138 int16_t deltaN[kTableSize], deltaS[kTableSize];
139 int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
140 int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
141 int32_t h0_test, h1_test;
142 int32_t tmp1_s32, tmp2_s32;
143 int32_t sum_log_likelihood_ratios = 0;
144 int32_t noise_global_mean, speech_global_mean;
145 int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
146 int16_t overhead1, overhead2, individualTest, totalTest;
147
148 // Set various thresholds based on frame lengths (80, 160 or 240 samples).
149 if (frame_length == 80) {
150 overhead1 = self->over_hang_max_1[0];
151 overhead2 = self->over_hang_max_2[0];
152 individualTest = self->individual[0];
153 totalTest = self->total[0];
154 } else if (frame_length == 160) {
155 overhead1 = self->over_hang_max_1[1];
156 overhead2 = self->over_hang_max_2[1];
157 individualTest = self->individual[1];
158 totalTest = self->total[1];
159 } else {
160 overhead1 = self->over_hang_max_1[2];
161 overhead2 = self->over_hang_max_2[2];
162 individualTest = self->individual[2];
163 totalTest = self->total[2];
164 }
165
166 if (total_power > kMinEnergy) {
167 // The signal power of current frame is large enough for processing. The
168 // processing consists of two parts:
169 // 1) Calculating the likelihood of speech and thereby a VAD decision.
170 // 2) Updating the underlying model, w.r.t., the decision made.
171
172 // The detection scheme is an LRT with hypothesis
173 // H0: Noise
174 // H1: Speech
175 //
176 // We combine a global LRT with local tests, for each frequency sub-band,
177 // here defined as |channel|.
178 for (channel = 0; channel < kNumChannels; channel++) {
179 // For each channel we model the probability with a GMM consisting of
180 // |kNumGaussians|, with different means and standard deviations depending
181 // on H0 or H1.
182 h0_test = 0;
183 h1_test = 0;
184 for (k = 0; k < kNumGaussians; k++) {
185 gaussian = channel + k * kNumChannels;
186 // Probability under H0, that is, probability of frame being noise.
187 // Value given in Q27 = Q7 * Q20.
188 tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
189 self->noise_means[gaussian],
190 self->noise_stds[gaussian],
191 &deltaN[gaussian]);
192 noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
193 h0_test += noise_probability[k]; // Q27
194
195 // Probability under H1, that is, probability of frame being speech.
196 // Value given in Q27 = Q7 * Q20.
197 tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
198 self->speech_means[gaussian],
199 self->speech_stds[gaussian],
200 &deltaS[gaussian]);
201 speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
202 h1_test += speech_probability[k]; // Q27
203 }
204
205 // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
206 // Approximation:
207 // log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
208 // = log2(h1_test) - log2(h0_test)
209 // = log2(2^(31-shifts_h1)*(1+b1))
210 // - log2(2^(31-shifts_h0)*(1+b0))
211 // = shifts_h0 - shifts_h1
212 // + log2(1+b1) - log2(1+b0)
213 // ~= shifts_h0 - shifts_h1
214 //
215 // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
216 // Further, b0 and b1 are independent and on the average the two terms
217 // cancel.
218 shifts_h0 = WebRtcSpl_NormW32(h0_test);
219 shifts_h1 = WebRtcSpl_NormW32(h1_test);
220 if (h0_test == 0) {
221 shifts_h0 = 31;
222 }
223 if (h1_test == 0) {
224 shifts_h1 = 31;
225 }
226 log_likelihood_ratio = shifts_h0 - shifts_h1;
227
228 // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
229 // used for the global VAD decision.
230 sum_log_likelihood_ratios +=
231 (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
232
233 // Local VAD decision.
234 if ((log_likelihood_ratio << 2) > individualTest) {
235 vadflag = 1;
236 }
237
238 // TODO(bjornv): The conditional probabilities below are applied on the
239 // hard coded number of Gaussians set to two. Find a way to generalize.
240 // Calculate local noise probabilities used later when updating the GMM.
241 h0 = (int16_t) (h0_test >> 12); // Q15
242 if (h0 > 0) {
243 // High probability of noise. Assign conditional probabilities for each
244 // Gaussian in the GMM.
245 tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29
246 ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14
247 ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
248 } else {
249 // Low noise probability. Assign conditional probability 1 to the first
250 // Gaussian and 0 to the rest (which is already set at initialization).
251 ngprvec[channel] = 16384;
252 }
253
254 // Calculate local speech probabilities used later when updating the GMM.
255 h1 = (int16_t) (h1_test >> 12); // Q15
256 if (h1 > 0) {
257 // High probability of speech. Assign conditional probabilities for each
258 // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
259 tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29
260 sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14
261 sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
262 }
263 }
264
265 // Make a global VAD decision.
266 vadflag |= (sum_log_likelihood_ratios >= totalTest);
267
268 // Update the model parameters.
269 maxspe = 12800;
270 for (channel = 0; channel < kNumChannels; channel++) {
271
272 // Get minimum value in past which is used for long term correction in Q4.
273 feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
274
275 // Compute the "global" mean, that is the sum of the two means weighted.
276 noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
277 &kNoiseDataWeights[channel]);
278 tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8
279
280 for (k = 0; k < kNumGaussians; k++) {
281 gaussian = channel + k * kNumChannels;
282
283 nmk = self->noise_means[gaussian];
284 smk = self->speech_means[gaussian];
285 nsk = self->noise_stds[gaussian];
286 ssk = self->speech_stds[gaussian];
287
288 // Update noise mean vector if the frame consists of noise only.
289 nmk2 = nmk;
290 if (!vadflag) {
291 // deltaN = (x-mu)/sigma^2
292 // ngprvec[k] = |noise_probability[k]| /
293 // (|noise_probability[0]| + |noise_probability[1]|)
294
295 // (Q14 * Q11 >> 11) = Q14.
296 delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[gaussian],
297 deltaN[gaussian],
298 11);
299 // Q7 + (Q14 * Q15 >> 22) = Q7.
300 nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
301 kNoiseUpdateConst,
302 22);
303 }
304
305 // Long term correction of the noise mean.
306 // Q8 - Q8 = Q8.
307 ndelt = (feature_minimum << 4) - tmp1_s16;
308 // Q7 + (Q8 * Q8) >> 9 = Q7.
309 nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);
310
311 // Control that the noise mean does not drift to much.
312 tmp_s16 = (int16_t) ((k + 5) << 7);
313 if (nmk3 < tmp_s16) {
314 nmk3 = tmp_s16;
315 }
316 tmp_s16 = (int16_t) ((72 + k - channel) << 7);
317 if (nmk3 > tmp_s16) {
318 nmk3 = tmp_s16;
319 }
320 self->noise_means[gaussian] = nmk3;
321
322 if (vadflag) {
323 // Update speech mean vector:
324 // |deltaS| = (x-mu)/sigma^2
325 // sgprvec[k] = |speech_probability[k]| /
326 // (|speech_probability[0]| + |speech_probability[1]|)
327
328 // (Q14 * Q11) >> 11 = Q14.
329 delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[gaussian],
330 deltaS[gaussian],
331 11);
332 // Q14 * Q15 >> 21 = Q8.
333 tmp_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
334 kSpeechUpdateConst,
335 21);
336 // Q7 + (Q8 >> 1) = Q7. With rounding.
337 smk2 = smk + ((tmp_s16 + 1) >> 1);
338
339 // Control that the speech mean does not drift to much.
340 maxmu = maxspe + 640;
341 if (smk2 < kMinimumMean[k]) {
342 smk2 = kMinimumMean[k];
343 }
344 if (smk2 > maxmu) {
345 smk2 = maxmu;
346 }
347 self->speech_means[gaussian] = smk2; // Q7.
348
349 // (Q7 >> 3) = Q4. With rounding.
350 tmp_s16 = ((smk + 4) >> 3);
351
352 tmp_s16 = features[channel] - tmp_s16; // Q4
353 // (Q11 * Q4 >> 3) = Q12.
354 tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[gaussian], tmp_s16, 3);
355 tmp2_s32 = tmp1_s32 - 4096;
356 tmp_s16 = sgprvec[gaussian] >> 2;
357 // (Q14 >> 2) * Q12 = Q24.
358 tmp1_s32 = tmp_s16 * tmp2_s32;
359
360 tmp2_s32 = tmp1_s32 >> 4; // Q20
361
362 // 0.1 * Q20 / Q7 = Q13.
363 if (tmp2_s32 > 0) {
364 tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
365 } else {
366 tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
367 tmp_s16 = -tmp_s16;
368 }
369 // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
370 // Note that division by 4 equals shift by 2, hence,
371 // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
372 tmp_s16 += 128; // Rounding.
373 ssk += (tmp_s16 >> 8);
374 if (ssk < kMinStd) {
375 ssk = kMinStd;
376 }
377 self->speech_stds[gaussian] = ssk;
378 } else {
379 // Update GMM variance vectors.
380 // deltaN * (features[channel] - nmk) - 1
381 // Q4 - (Q7 >> 3) = Q4.
382 tmp_s16 = features[channel] - (nmk >> 3);
383 // (Q11 * Q4 >> 3) = Q12.
384 tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[gaussian], tmp_s16, 3);
385 tmp1_s32 -= 4096;
386
387 // (Q14 >> 2) * Q12 = Q24.
388 tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
389 tmp2_s32 = tmp_s16 * tmp1_s32;
390 // Q20 * approx 0.001 (2^-10=0.0009766), hence,
391 // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
392 tmp1_s32 = tmp2_s32 >> 14;
393
394 // Q20 / Q7 = Q13.
395 if (tmp1_s32 > 0) {
396 tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
397 } else {
398 tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
399 tmp_s16 = -tmp_s16;
400 }
401 tmp_s16 += 32; // Rounding
402 nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7.
403 if (nsk < kMinStd) {
404 nsk = kMinStd;
405 }
406 self->noise_stds[gaussian] = nsk;
407 }
408 }
409
410 // Separate models if they are too close.
411 // |noise_global_mean| in Q14 (= Q7 * Q7).
412 noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
413 &kNoiseDataWeights[channel]);
414
415 // |speech_global_mean| in Q14 (= Q7 * Q7).
416 speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
417 &kSpeechDataWeights[channel]);
418
419 // |diff| = "global" speech mean - "global" noise mean.
420 // (Q14 >> 9) - (Q14 >> 9) = Q5.
421 diff = (int16_t) (speech_global_mean >> 9) -
422 (int16_t) (noise_global_mean >> 9);
423 if (diff < kMinimumDifference[channel]) {
424 tmp_s16 = kMinimumDifference[channel] - diff;
425
426 // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
427 // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
428 tmp1_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp_s16, 2);
429 tmp2_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp_s16, 2);
430
431 // Move Gaussian means for speech model by |tmp1_s16| and update
432 // |speech_global_mean|. Note that |self->speech_means[channel]| is
433 // changed after the call.
434 speech_global_mean = WeightedAverage(&self->speech_means[channel],
435 tmp1_s16,
436 &kSpeechDataWeights[channel]);
437
438 // Move Gaussian means for noise model by -|tmp2_s16| and update
439 // |noise_global_mean|. Note that |self->noise_means[channel]| is
440 // changed after the call.
441 noise_global_mean = WeightedAverage(&self->noise_means[channel],
442 -tmp2_s16,
443 &kNoiseDataWeights[channel]);
444 }
445
446 // Control that the speech & noise means do not drift to much.
447 maxspe = kMaximumSpeech[channel];
448 tmp2_s16 = (int16_t) (speech_global_mean >> 7);
449 if (tmp2_s16 > maxspe) {
450 // Upper limit of speech model.
451 tmp2_s16 -= maxspe;
452
453 for (k = 0; k < kNumGaussians; k++) {
454 self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
455 }
456 }
457
458 tmp2_s16 = (int16_t) (noise_global_mean >> 7);
459 if (tmp2_s16 > kMaximumNoise[channel]) {
460 tmp2_s16 -= kMaximumNoise[channel];
461
462 for (k = 0; k < kNumGaussians; k++) {
463 self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
464 }
465 }
466 }
467 self->frame_counter++;
468 }
469
470 // Smooth with respect to transition hysteresis.
471 if (!vadflag) {
472 if (self->over_hang > 0) {
473 vadflag = 2 + self->over_hang;
474 self->over_hang--;
475 }
476 self->num_of_speech = 0;
477 } else {
478 self->num_of_speech++;
479 if (self->num_of_speech > kMaxSpeechFrames) {
480 self->num_of_speech = kMaxSpeechFrames;
481 self->over_hang = overhead2;
482 } else {
483 self->over_hang = overhead1;
484 }
485 }
486 return vadflag;
487}
488
489// Initialize the VAD. Set aggressiveness mode to default value.
490int WebRtcVad_InitCore(VadInstT* self) {
491 int i;
492
493 if (self == NULL) {
494 return -1;
495 }
496
497 // Initialization of general struct variables.
498 self->vad = 1; // Speech active (=1).
499 self->frame_counter = 0;
500 self->over_hang = 0;
501 self->num_of_speech = 0;
502
503 // Initialization of downsampling filter state.
504 memset(self->downsampling_filter_states, 0,
505 sizeof(self->downsampling_filter_states));
506
507 // Initialization of 48 to 8 kHz downsampling.
508 WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
509
510 // Read initial PDF parameters.
511 for (i = 0; i < kTableSize; i++) {
512 self->noise_means[i] = kNoiseDataMeans[i];
513 self->speech_means[i] = kSpeechDataMeans[i];
514 self->noise_stds[i] = kNoiseDataStds[i];
515 self->speech_stds[i] = kSpeechDataStds[i];
516 }
517
518 // Initialize Index and Minimum value vectors.
519 for (i = 0; i < 16 * kNumChannels; i++) {
520 self->low_value_vector[i] = 10000;
521 self->index_vector[i] = 0;
522 }
523
524 // Initialize splitting filter states.
525 memset(self->upper_state, 0, sizeof(self->upper_state));
526 memset(self->lower_state, 0, sizeof(self->lower_state));
527
528 // Initialize high pass filter states.
529 memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
530
531 // Initialize mean value memory, for WebRtcVad_FindMinimum().
532 for (i = 0; i < kNumChannels; i++) {
533 self->mean_value[i] = 1600;
534 }
535
536 // Set aggressiveness mode to default (=|kDefaultMode|).
537 if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
538 return -1;
539 }
540
541 self->init_flag = kInitCheck;
542
543 return 0;
544}
545
546// Set aggressiveness mode
547int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
548 int return_value = 0;
549
550 switch (mode) {
551 case 0:
552 // Quality mode.
553 memcpy(self->over_hang_max_1, kOverHangMax1Q,
554 sizeof(self->over_hang_max_1));
555 memcpy(self->over_hang_max_2, kOverHangMax2Q,
556 sizeof(self->over_hang_max_2));
557 memcpy(self->individual, kLocalThresholdQ,
558 sizeof(self->individual));
559 memcpy(self->total, kGlobalThresholdQ,
560 sizeof(self->total));
561 break;
562 case 1:
563 // Low bitrate mode.
564 memcpy(self->over_hang_max_1, kOverHangMax1LBR,
565 sizeof(self->over_hang_max_1));
566 memcpy(self->over_hang_max_2, kOverHangMax2LBR,
567 sizeof(self->over_hang_max_2));
568 memcpy(self->individual, kLocalThresholdLBR,
569 sizeof(self->individual));
570 memcpy(self->total, kGlobalThresholdLBR,
571 sizeof(self->total));
572 break;
573 case 2:
574 // Aggressive mode.
575 memcpy(self->over_hang_max_1, kOverHangMax1AGG,
576 sizeof(self->over_hang_max_1));
577 memcpy(self->over_hang_max_2, kOverHangMax2AGG,
578 sizeof(self->over_hang_max_2));
579 memcpy(self->individual, kLocalThresholdAGG,
580 sizeof(self->individual));
581 memcpy(self->total, kGlobalThresholdAGG,
582 sizeof(self->total));
583 break;
584 case 3:
585 // Very aggressive mode.
586 memcpy(self->over_hang_max_1, kOverHangMax1VAG,
587 sizeof(self->over_hang_max_1));
588 memcpy(self->over_hang_max_2, kOverHangMax2VAG,
589 sizeof(self->over_hang_max_2));
590 memcpy(self->individual, kLocalThresholdVAG,
591 sizeof(self->individual));
592 memcpy(self->total, kGlobalThresholdVAG,
593 sizeof(self->total));
594 break;
595 default:
596 return_value = -1;
597 break;
598 }
599
600 return return_value;
601}
602
603// Calculate VAD decision by first extracting feature values and then calculate
604// probability for both speech and background noise.
605
andrew@webrtc.orgc2e64382014-04-30 16:44:13 +0000606int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000607 int frame_length) {
608 int vad;
609 int i;
610 int16_t speech_nb[240]; // 30 ms in 8 kHz.
611 // |tmp_mem| is a temporary memory used by resample function, length is
612 // frame length in 10 ms (480 samples) + 256 extra.
613 int32_t tmp_mem[480 + 256] = { 0 };
614 const int kFrameLen10ms48khz = 480;
615 const int kFrameLen10ms8khz = 80;
616 int num_10ms_frames = frame_length / kFrameLen10ms48khz;
617
618 for (i = 0; i < num_10ms_frames; i++) {
619 WebRtcSpl_Resample48khzTo8khz(speech_frame,
620 &speech_nb[i * kFrameLen10ms8khz],
621 &inst->state_48_to_8,
622 tmp_mem);
623 }
624
625 // Do VAD on an 8 kHz signal
626 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
627
628 return vad;
629}
630
andrew@webrtc.orgc2e64382014-04-30 16:44:13 +0000631int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000632 int frame_length)
633{
634 int len, vad;
635 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
636 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
637
638
639 // Downsample signal 32->16->8 before doing VAD
640 WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
641 frame_length);
642 len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
643
644 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
645 len = WEBRTC_SPL_RSHIFT_W16(len, 1);
646
647 // Do VAD on an 8 kHz signal
648 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
649
650 return vad;
651}
652
andrew@webrtc.orgc2e64382014-04-30 16:44:13 +0000653int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000654 int frame_length)
655{
656 int len, vad;
657 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
658
659 // Wideband: Downsample signal before doing VAD
660 WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
661 frame_length);
662
663 len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
664 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
665
666 return vad;
667}
668
andrew@webrtc.orgc2e64382014-04-30 16:44:13 +0000669int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.orga7b57da2012-10-22 18:19:23 +0000670 int frame_length)
671{
672 int16_t feature_vector[kNumChannels], total_power;
673
674 // Get power in the bands
675 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
676 feature_vector);
677
678 // Make a VAD
679 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
680
681 return inst->vad;
682}