Blame - common_audio/vad/vad_core.c - fp2-dev/platform/external/chromium_org/third_party/webrtc

blob: 98da6eaf0b7cdcab4e953f29ebe68ce45ac81804 [file] [log] [blame]

andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
pbos@webrtc.org	abf0cd8	2013-05-27 09:49:58 +0000	[diff] [blame]	11	#include "webrtc/common_audio/vad/vad_core.h"
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	12
pbos@webrtc.org	abf0cd8	2013-05-27 09:49:58 +0000	[diff] [blame]	13	#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
				14	#include "webrtc/common_audio/vad/vad_filterbank.h"
				15	#include "webrtc/common_audio/vad/vad_gmm.h"
				16	#include "webrtc/common_audio/vad/vad_sp.h"
				17	#include "webrtc/typedefs.h"
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	18
				19	// Spectrum Weighting
				20	static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
				21	static const int16_t kNoiseUpdateConst = 655; // Q15
				22	static const int16_t kSpeechUpdateConst = 6554; // Q15
				23	static const int16_t kBackEta = 154; // Q8
				24	// Minimum difference between the two models, Q5
				25	static const int16_t kMinimumDifference[kNumChannels] = {
				26	544, 544, 576, 576, 576, 576 };
				27	// Upper limit of mean value for speech model, Q7
				28	static const int16_t kMaximumSpeech[kNumChannels] = {
				29	11392, 11392, 11520, 11520, 11520, 11520 };
				30	// Minimum value for mean value
				31	static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
				32	// Upper limit of mean value for noise model, Q7
				33	static const int16_t kMaximumNoise[kNumChannels] = {
				34	9216, 9088, 8960, 8832, 8704, 8576 };
				35	// Start values for the Gaussian models, Q7
				36	// Weights for the two Gaussians for the six channels (noise)
				37	static const int16_t kNoiseDataWeights[kTableSize] = {
				38	34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
				39	// Weights for the two Gaussians for the six channels (speech)
				40	static const int16_t kSpeechDataWeights[kTableSize] = {
				41	48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
				42	// Means for the two Gaussians for the six channels (noise)
				43	static const int16_t kNoiseDataMeans[kTableSize] = {
				44	6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
				45	// Means for the two Gaussians for the six channels (speech)
				46	static const int16_t kSpeechDataMeans[kTableSize] = {
				47	8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
				48	};
				49	// Stds for the two Gaussians for the six channels (noise)
				50	static const int16_t kNoiseDataStds[kTableSize] = {
				51	378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
				52	// Stds for the two Gaussians for the six channels (speech)
				53	static const int16_t kSpeechDataStds[kTableSize] = {
				54	555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
				55
				56	// Constants used in GmmProbability().
				57	//
				58	// Maximum number of counted speech (VAD = 1) frames in a row.
				59	static const int16_t kMaxSpeechFrames = 6;
				60	// Minimum standard deviation for both speech and noise.
				61	static const int16_t kMinStd = 384;
				62
				63	// Constants in WebRtcVad_InitCore().
				64	// Default aggressiveness mode.
				65	static const short kDefaultMode = 0;
				66	static const int kInitCheck = 42;
				67
				68	// Constants used in WebRtcVad_set_mode_core().
				69	//
				70	// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
				71	//
				72	// Mode 0, Quality.
				73	static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
				74	static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
				75	static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
				76	static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
				77	// Mode 1, Low bitrate.
				78	static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
				79	static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
				80	static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
				81	static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
				82	// Mode 2, Aggressive.
				83	static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
				84	static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
				85	static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
				86	static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
				87	// Mode 3, Very aggressive.
				88	static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
				89	static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
				90	static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
				91	static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
				92
				93	// Calculates the weighted average w.r.t. number of Gaussians. The \|data\| are
				94	// updated with an \|offset\| before averaging.
				95	//
				96	// - data [i/o] : Data to average.
				97	// - offset [i] : An offset added to \|data\|.
				98	// - weights [i] : Weights used for averaging.
				99	//
				100	// returns : The weighted average.
				101	static int32_t WeightedAverage(int16_t* data, int16_t offset,
				102	const int16_t* weights) {
				103	int k;
				104	int32_t weighted_average = 0;
				105
				106	for (k = 0; k < kNumGaussians; k++) {
				107	data[k * kNumChannels] += offset;
				108	weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
				109	}
				110	return weighted_average;
				111	}
				112
				113	// Calculates the probabilities for both speech and background noise using
				114	// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
				115	// type of signal is most probable.
				116	//
				117	// - self [i/o] : Pointer to VAD instance
				118	// - features [i] : Feature vector of length \|kNumChannels\|
				119	// = log10(energy in frequency band)
				120	// - total_power [i] : Total power in audio frame.
				121	// - frame_length [i] : Number of input samples
				122	//
				123	// - returns : the VAD decision (0 - noise, 1 - speech).
				124	static int16_t GmmProbability(VadInstT* self, int16_t* features,
				125	int16_t total_power, int frame_length) {
				126	int channel, k;
				127	int16_t feature_minimum;
				128	int16_t h0, h1;
				129	int16_t log_likelihood_ratio;
				130	int16_t vadflag = 0;
				131	int16_t shifts_h0, shifts_h1;
				132	int16_t tmp_s16, tmp1_s16, tmp2_s16;
				133	int16_t diff;
				134	int gaussian;
				135	int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
				136	int16_t delt, ndelt;
				137	int16_t maxspe, maxmu;
				138	int16_t deltaN[kTableSize], deltaS[kTableSize];
				139	int16_t ngprvec[kTableSize] = { 0 }; // Conditional probability = 0.
				140	int16_t sgprvec[kTableSize] = { 0 }; // Conditional probability = 0.
				141	int32_t h0_test, h1_test;
				142	int32_t tmp1_s32, tmp2_s32;
				143	int32_t sum_log_likelihood_ratios = 0;
				144	int32_t noise_global_mean, speech_global_mean;
				145	int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
				146	int16_t overhead1, overhead2, individualTest, totalTest;
				147
				148	// Set various thresholds based on frame lengths (80, 160 or 240 samples).
				149	if (frame_length == 80) {
				150	overhead1 = self->over_hang_max_1[0];
				151	overhead2 = self->over_hang_max_2[0];
				152	individualTest = self->individual[0];
				153	totalTest = self->total[0];
				154	} else if (frame_length == 160) {
				155	overhead1 = self->over_hang_max_1[1];
				156	overhead2 = self->over_hang_max_2[1];
				157	individualTest = self->individual[1];
				158	totalTest = self->total[1];
				159	} else {
				160	overhead1 = self->over_hang_max_1[2];
				161	overhead2 = self->over_hang_max_2[2];
				162	individualTest = self->individual[2];
				163	totalTest = self->total[2];
				164	}
				165
				166	if (total_power > kMinEnergy) {
				167	// The signal power of current frame is large enough for processing. The
				168	// processing consists of two parts:
				169	// 1) Calculating the likelihood of speech and thereby a VAD decision.
				170	// 2) Updating the underlying model, w.r.t., the decision made.
				171
				172	// The detection scheme is an LRT with hypothesis
				173	// H0: Noise
				174	// H1: Speech
				175	//
				176	// We combine a global LRT with local tests, for each frequency sub-band,
				177	// here defined as \|channel\|.
				178	for (channel = 0; channel < kNumChannels; channel++) {
				179	// For each channel we model the probability with a GMM consisting of
				180	// \|kNumGaussians\|, with different means and standard deviations depending
				181	// on H0 or H1.
				182	h0_test = 0;
				183	h1_test = 0;
				184	for (k = 0; k < kNumGaussians; k++) {
				185	gaussian = channel + k * kNumChannels;
				186	// Probability under H0, that is, probability of frame being noise.
				187	// Value given in Q27 = Q7 * Q20.
				188	tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
				189	self->noise_means[gaussian],
				190	self->noise_stds[gaussian],
				191	&deltaN[gaussian]);
				192	noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
				193	h0_test += noise_probability[k]; // Q27
				194
				195	// Probability under H1, that is, probability of frame being speech.
				196	// Value given in Q27 = Q7 * Q20.
				197	tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
				198	self->speech_means[gaussian],
				199	self->speech_stds[gaussian],
				200	&deltaS[gaussian]);
				201	speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
				202	h1_test += speech_probability[k]; // Q27
				203	}
				204
				205	// Calculate the log likelihood ratio: log2(Pr{X\|H1} / Pr{X\|H1}).
				206	// Approximation:
				207	// log2(Pr{X\|H1} / Pr{X\|H1}) = log2(Pr{X\|H1}2^Q) - log2(Pr{X\|H1}2^Q)
				208	// = log2(h1_test) - log2(h0_test)
				209	// = log2(2^(31-shifts_h1)*(1+b1))
				210	// - log2(2^(31-shifts_h0)*(1+b0))
				211	// = shifts_h0 - shifts_h1
				212	// + log2(1+b1) - log2(1+b0)
				213	// ~= shifts_h0 - shifts_h1
				214	//
				215	// Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
				216	// Further, b0 and b1 are independent and on the average the two terms
				217	// cancel.
				218	shifts_h0 = WebRtcSpl_NormW32(h0_test);
				219	shifts_h1 = WebRtcSpl_NormW32(h1_test);
				220	if (h0_test == 0) {
				221	shifts_h0 = 31;
				222	}
				223	if (h1_test == 0) {
				224	shifts_h1 = 31;
				225	}
				226	log_likelihood_ratio = shifts_h0 - shifts_h1;
				227
				228	// Update \|sum_log_likelihood_ratios\| with spectrum weighting. This is
				229	// used for the global VAD decision.
				230	sum_log_likelihood_ratios +=
				231	(int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);
				232
				233	// Local VAD decision.
				234	if ((log_likelihood_ratio << 2) > individualTest) {
				235	vadflag = 1;
				236	}
				237
				238	// TODO(bjornv): The conditional probabilities below are applied on the
				239	// hard coded number of Gaussians set to two. Find a way to generalize.
				240	// Calculate local noise probabilities used later when updating the GMM.
				241	h0 = (int16_t) (h0_test >> 12); // Q15
				242	if (h0 > 0) {
				243	// High probability of noise. Assign conditional probabilities for each
				244	// Gaussian in the GMM.
				245	tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2; // Q29
				246	ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0); // Q14
				247	ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
				248	} else {
				249	// Low noise probability. Assign conditional probability 1 to the first
				250	// Gaussian and 0 to the rest (which is already set at initialization).
				251	ngprvec[channel] = 16384;
				252	}
				253
				254	// Calculate local speech probabilities used later when updating the GMM.
				255	h1 = (int16_t) (h1_test >> 12); // Q15
				256	if (h1 > 0) {
				257	// High probability of speech. Assign conditional probabilities for each
				258	// Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
				259	tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2; // Q29
				260	sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1); // Q14
				261	sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
				262	}
				263	}
				264
				265	// Make a global VAD decision.
				266	vadflag \|= (sum_log_likelihood_ratios >= totalTest);
				267
				268	// Update the model parameters.
				269	maxspe = 12800;
				270	for (channel = 0; channel < kNumChannels; channel++) {
				271
				272	// Get minimum value in past which is used for long term correction in Q4.
				273	feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
				274
				275	// Compute the "global" mean, that is the sum of the two means weighted.
				276	noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
				277	&kNoiseDataWeights[channel]);
				278	tmp1_s16 = (int16_t) (noise_global_mean >> 6); // Q8
				279
				280	for (k = 0; k < kNumGaussians; k++) {
				281	gaussian = channel + k * kNumChannels;
				282
				283	nmk = self->noise_means[gaussian];
				284	smk = self->speech_means[gaussian];
				285	nsk = self->noise_stds[gaussian];
				286	ssk = self->speech_stds[gaussian];
				287
				288	// Update noise mean vector if the frame consists of noise only.
				289	nmk2 = nmk;
				290	if (!vadflag) {
				291	// deltaN = (x-mu)/sigma^2
				292	// ngprvec[k] = \|noise_probability[k]\| /
				293	// (\|noise_probability[0]\| + \|noise_probability[1]\|)
				294
				295	// (Q14 * Q11 >> 11) = Q14.
				296	delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[gaussian],
				297	deltaN[gaussian],
				298	11);
				299	// Q7 + (Q14 * Q15 >> 22) = Q7.
				300	nmk2 = nmk + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
				301	kNoiseUpdateConst,
				302	22);
				303	}
				304
				305	// Long term correction of the noise mean.
				306	// Q8 - Q8 = Q8.
				307	ndelt = (feature_minimum << 4) - tmp1_s16;
				308	// Q7 + (Q8 * Q8) >> 9 = Q7.
				309	nmk3 = nmk2 + (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(ndelt, kBackEta, 9);
				310
				311	// Control that the noise mean does not drift to much.
				312	tmp_s16 = (int16_t) ((k + 5) << 7);
				313	if (nmk3 < tmp_s16) {
				314	nmk3 = tmp_s16;
				315	}
				316	tmp_s16 = (int16_t) ((72 + k - channel) << 7);
				317	if (nmk3 > tmp_s16) {
				318	nmk3 = tmp_s16;
				319	}
				320	self->noise_means[gaussian] = nmk3;
				321
				322	if (vadflag) {
				323	// Update speech mean vector:
				324	// \|deltaS\| = (x-mu)/sigma^2
				325	// sgprvec[k] = \|speech_probability[k]\| /
				326	// (\|speech_probability[0]\| + \|speech_probability[1]\|)
				327
				328	// (Q14 * Q11) >> 11 = Q14.
				329	delt = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[gaussian],
				330	deltaS[gaussian],
				331	11);
				332	// Q14 * Q15 >> 21 = Q8.
				333	tmp_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(delt,
				334	kSpeechUpdateConst,
				335	21);
				336	// Q7 + (Q8 >> 1) = Q7. With rounding.
				337	smk2 = smk + ((tmp_s16 + 1) >> 1);
				338
				339	// Control that the speech mean does not drift to much.
				340	maxmu = maxspe + 640;
				341	if (smk2 < kMinimumMean[k]) {
				342	smk2 = kMinimumMean[k];
				343	}
				344	if (smk2 > maxmu) {
				345	smk2 = maxmu;
				346	}
				347	self->speech_means[gaussian] = smk2; // Q7.
				348
				349	// (Q7 >> 3) = Q4. With rounding.
				350	tmp_s16 = ((smk + 4) >> 3);
				351
				352	tmp_s16 = features[channel] - tmp_s16; // Q4
				353	// (Q11 * Q4 >> 3) = Q12.
				354	tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[gaussian], tmp_s16, 3);
				355	tmp2_s32 = tmp1_s32 - 4096;
				356	tmp_s16 = sgprvec[gaussian] >> 2;
				357	// (Q14 >> 2) * Q12 = Q24.
				358	tmp1_s32 = tmp_s16 * tmp2_s32;
				359
				360	tmp2_s32 = tmp1_s32 >> 4; // Q20
				361
				362	// 0.1 * Q20 / Q7 = Q13.
				363	if (tmp2_s32 > 0) {
				364	tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
				365	} else {
				366	tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
				367	tmp_s16 = -tmp_s16;
				368	}
				369	// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
				370	// Note that division by 4 equals shift by 2, hence,
				371	// (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
				372	tmp_s16 += 128; // Rounding.
				373	ssk += (tmp_s16 >> 8);
				374	if (ssk < kMinStd) {
				375	ssk = kMinStd;
				376	}
				377	self->speech_stds[gaussian] = ssk;
				378	} else {
				379	// Update GMM variance vectors.
				380	// deltaN * (features[channel] - nmk) - 1
				381	// Q4 - (Q7 >> 3) = Q4.
				382	tmp_s16 = features[channel] - (nmk >> 3);
				383	// (Q11 * Q4 >> 3) = Q12.
				384	tmp1_s32 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[gaussian], tmp_s16, 3);
				385	tmp1_s32 -= 4096;
				386
				387	// (Q14 >> 2) * Q12 = Q24.
				388	tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
				389	tmp2_s32 = tmp_s16 * tmp1_s32;
				390	// Q20 * approx 0.001 (2^-10=0.0009766), hence,
				391	// (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
				392	tmp1_s32 = tmp2_s32 >> 14;
				393
				394	// Q20 / Q7 = Q13.
				395	if (tmp1_s32 > 0) {
				396	tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
				397	} else {
				398	tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
				399	tmp_s16 = -tmp_s16;
				400	}
				401	tmp_s16 += 32; // Rounding
				402	nsk += tmp_s16 >> 6; // Q13 >> 6 = Q7.
				403	if (nsk < kMinStd) {
				404	nsk = kMinStd;
				405	}
				406	self->noise_stds[gaussian] = nsk;
				407	}
				408	}
				409
				410	// Separate models if they are too close.
				411	// \|noise_global_mean\| in Q14 (= Q7 * Q7).
				412	noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
				413	&kNoiseDataWeights[channel]);
				414
				415	// \|speech_global_mean\| in Q14 (= Q7 * Q7).
				416	speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
				417	&kSpeechDataWeights[channel]);
				418
				419	// \|diff\| = "global" speech mean - "global" noise mean.
				420	// (Q14 >> 9) - (Q14 >> 9) = Q5.
				421	diff = (int16_t) (speech_global_mean >> 9) -
				422	(int16_t) (noise_global_mean >> 9);
				423	if (diff < kMinimumDifference[channel]) {
				424	tmp_s16 = kMinimumDifference[channel] - diff;
				425
				426	// \|tmp1_s16\| = ~0.8 * (kMinimumDifference - diff) in Q7.
				427	// \|tmp2_s16\| = ~0.2 * (kMinimumDifference - diff) in Q7.
				428	tmp1_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(13, tmp_s16, 2);
				429	tmp2_s16 = (int16_t) WEBRTC_SPL_MUL_16_16_RSFT(3, tmp_s16, 2);
				430
				431	// Move Gaussian means for speech model by \|tmp1_s16\| and update
				432	// \|speech_global_mean\|. Note that \|self->speech_means[channel]\| is
				433	// changed after the call.
				434	speech_global_mean = WeightedAverage(&self->speech_means[channel],
				435	tmp1_s16,
				436	&kSpeechDataWeights[channel]);
				437
				438	// Move Gaussian means for noise model by -\|tmp2_s16\| and update
				439	// \|noise_global_mean\|. Note that \|self->noise_means[channel]\| is
				440	// changed after the call.
				441	noise_global_mean = WeightedAverage(&self->noise_means[channel],
				442	-tmp2_s16,
				443	&kNoiseDataWeights[channel]);
				444	}
				445
				446	// Control that the speech & noise means do not drift to much.
				447	maxspe = kMaximumSpeech[channel];
				448	tmp2_s16 = (int16_t) (speech_global_mean >> 7);
				449	if (tmp2_s16 > maxspe) {
				450	// Upper limit of speech model.
				451	tmp2_s16 -= maxspe;
				452
				453	for (k = 0; k < kNumGaussians; k++) {
				454	self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
				455	}
				456	}
				457
				458	tmp2_s16 = (int16_t) (noise_global_mean >> 7);
				459	if (tmp2_s16 > kMaximumNoise[channel]) {
				460	tmp2_s16 -= kMaximumNoise[channel];
				461
				462	for (k = 0; k < kNumGaussians; k++) {
				463	self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
				464	}
				465	}
				466	}
				467	self->frame_counter++;
				468	}
				469
				470	// Smooth with respect to transition hysteresis.
				471	if (!vadflag) {
				472	if (self->over_hang > 0) {
				473	vadflag = 2 + self->over_hang;
				474	self->over_hang--;
				475	}
				476	self->num_of_speech = 0;
				477	} else {
				478	self->num_of_speech++;
				479	if (self->num_of_speech > kMaxSpeechFrames) {
				480	self->num_of_speech = kMaxSpeechFrames;
				481	self->over_hang = overhead2;
				482	} else {
				483	self->over_hang = overhead1;
				484	}
				485	}
				486	return vadflag;
				487	}
				488
				489	// Initialize the VAD. Set aggressiveness mode to default value.
				490	int WebRtcVad_InitCore(VadInstT* self) {
				491	int i;
				492
				493	if (self == NULL) {
				494	return -1;
				495	}
				496
				497	// Initialization of general struct variables.
				498	self->vad = 1; // Speech active (=1).
				499	self->frame_counter = 0;
				500	self->over_hang = 0;
				501	self->num_of_speech = 0;
				502
				503	// Initialization of downsampling filter state.
				504	memset(self->downsampling_filter_states, 0,
				505	sizeof(self->downsampling_filter_states));
				506
				507	// Initialization of 48 to 8 kHz downsampling.
				508	WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
				509
				510	// Read initial PDF parameters.
				511	for (i = 0; i < kTableSize; i++) {
				512	self->noise_means[i] = kNoiseDataMeans[i];
				513	self->speech_means[i] = kSpeechDataMeans[i];
				514	self->noise_stds[i] = kNoiseDataStds[i];
				515	self->speech_stds[i] = kSpeechDataStds[i];
				516	}
				517
				518	// Initialize Index and Minimum value vectors.
				519	for (i = 0; i < 16 * kNumChannels; i++) {
				520	self->low_value_vector[i] = 10000;
				521	self->index_vector[i] = 0;
				522	}
				523
				524	// Initialize splitting filter states.
				525	memset(self->upper_state, 0, sizeof(self->upper_state));
				526	memset(self->lower_state, 0, sizeof(self->lower_state));
				527
				528	// Initialize high pass filter states.
				529	memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
				530
				531	// Initialize mean value memory, for WebRtcVad_FindMinimum().
				532	for (i = 0; i < kNumChannels; i++) {
				533	self->mean_value[i] = 1600;
				534	}
				535
				536	// Set aggressiveness mode to default (=\|kDefaultMode\|).
				537	if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
				538	return -1;
				539	}
				540
				541	self->init_flag = kInitCheck;
				542
				543	return 0;
				544	}
				545
				546	// Set aggressiveness mode
				547	int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
				548	int return_value = 0;
				549
				550	switch (mode) {
				551	case 0:
				552	// Quality mode.
				553	memcpy(self->over_hang_max_1, kOverHangMax1Q,
				554	sizeof(self->over_hang_max_1));
				555	memcpy(self->over_hang_max_2, kOverHangMax2Q,
				556	sizeof(self->over_hang_max_2));
				557	memcpy(self->individual, kLocalThresholdQ,
				558	sizeof(self->individual));
				559	memcpy(self->total, kGlobalThresholdQ,
				560	sizeof(self->total));
				561	break;
				562	case 1:
				563	// Low bitrate mode.
				564	memcpy(self->over_hang_max_1, kOverHangMax1LBR,
				565	sizeof(self->over_hang_max_1));
				566	memcpy(self->over_hang_max_2, kOverHangMax2LBR,
				567	sizeof(self->over_hang_max_2));
				568	memcpy(self->individual, kLocalThresholdLBR,
				569	sizeof(self->individual));
				570	memcpy(self->total, kGlobalThresholdLBR,
				571	sizeof(self->total));
				572	break;
				573	case 2:
				574	// Aggressive mode.
				575	memcpy(self->over_hang_max_1, kOverHangMax1AGG,
				576	sizeof(self->over_hang_max_1));
				577	memcpy(self->over_hang_max_2, kOverHangMax2AGG,
				578	sizeof(self->over_hang_max_2));
				579	memcpy(self->individual, kLocalThresholdAGG,
				580	sizeof(self->individual));
				581	memcpy(self->total, kGlobalThresholdAGG,
				582	sizeof(self->total));
				583	break;
				584	case 3:
				585	// Very aggressive mode.
				586	memcpy(self->over_hang_max_1, kOverHangMax1VAG,
				587	sizeof(self->over_hang_max_1));
				588	memcpy(self->over_hang_max_2, kOverHangMax2VAG,
				589	sizeof(self->over_hang_max_2));
				590	memcpy(self->individual, kLocalThresholdVAG,
				591	sizeof(self->individual));
				592	memcpy(self->total, kGlobalThresholdVAG,
				593	sizeof(self->total));
				594	break;
				595	default:
				596	return_value = -1;
				597	break;
				598	}
				599
				600	return return_value;
				601	}
				602
				603	// Calculate VAD decision by first extracting feature values and then calculate
				604	// probability for both speech and background noise.
				605
andrew@webrtc.org	c2e6438	2014-04-30 16:44:13 +0000	[diff] [blame]	606	int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	607	int frame_length) {
				608	int vad;
				609	int i;
				610	int16_t speech_nb[240]; // 30 ms in 8 kHz.
				611	// \|tmp_mem\| is a temporary memory used by resample function, length is
				612	// frame length in 10 ms (480 samples) + 256 extra.
				613	int32_t tmp_mem[480 + 256] = { 0 };
				614	const int kFrameLen10ms48khz = 480;
				615	const int kFrameLen10ms8khz = 80;
				616	int num_10ms_frames = frame_length / kFrameLen10ms48khz;
				617
				618	for (i = 0; i < num_10ms_frames; i++) {
				619	WebRtcSpl_Resample48khzTo8khz(speech_frame,
				620	&speech_nb[i * kFrameLen10ms8khz],
				621	&inst->state_48_to_8,
				622	tmp_mem);
				623	}
				624
				625	// Do VAD on an 8 kHz signal
				626	vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
				627
				628	return vad;
				629	}
				630
andrew@webrtc.org	c2e6438	2014-04-30 16:44:13 +0000	[diff] [blame]	631	int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	632	int frame_length)
				633	{
				634	int len, vad;
				635	int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
				636	int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
				637
				638
				639	// Downsample signal 32->16->8 before doing VAD
				640	WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
				641	frame_length);
				642	len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
				643
				644	WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
				645	len = WEBRTC_SPL_RSHIFT_W16(len, 1);
				646
				647	// Do VAD on an 8 kHz signal
				648	vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
				649
				650	return vad;
				651	}
				652
andrew@webrtc.org	c2e6438	2014-04-30 16:44:13 +0000	[diff] [blame]	653	int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	654	int frame_length)
				655	{
				656	int len, vad;
				657	int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
				658
				659	// Wideband: Downsample signal before doing VAD
				660	WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
				661	frame_length);
				662
				663	len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
				664	vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
				665
				666	return vad;
				667	}
				668
andrew@webrtc.org	c2e6438	2014-04-30 16:44:13 +0000	[diff] [blame]	669	int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
andrew@webrtc.org	a7b57da	2012-10-22 18:19:23 +0000	[diff] [blame]	670	int frame_length)
				671	{
				672	int16_t feature_vector[kNumChannels], total_power;
				673
				674	// Get power in the bands
				675	total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
				676	feature_vector);
				677
				678	// Make a VAD
				679	inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);
				680
				681	return inst->vad;
				682	}