Blame - modules/audio_coding/neteq4/time_stretch.cc - fp2-dev/platform/external/chromium_org/third_party/webrtc

blob: 7b63ac3241256eeb999e599c6fba3ff2ea75a8f5 [file] [log] [blame]

henrik.lundin@webrtc.org	9a40081	2013-01-29 12:09:21 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
				3	*
				4	* Use of this source code is governed by a BSD-style license
				5	* that can be found in the LICENSE file in the root of the source
				6	* tree. An additional intellectual property rights grant can be found
				7	* in the file PATENTS. All contributing project authors may
				8	* be found in the AUTHORS file in the root of the source tree.
				9	*/
				10
				11	#include "webrtc/modules/audio_coding/neteq4/time_stretch.h"
				12
				13	#include <algorithm> // min, max
				14
				15	#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
				16	#include "webrtc/modules/audio_coding/neteq4/background_noise.h"
				17	#include "webrtc/modules/audio_coding/neteq4/dsp_helper.h"
				18	#include "webrtc/system_wrappers/interface/scoped_ptr.h"
				19
				20	namespace webrtc {
				21
				22	TimeStretch::ReturnCodes TimeStretch::Process(
				23	const int16_t* input,
				24	size_t input_len,
				25	AudioMultiVector<int16_t>* output,
				26	int16_t* length_change_samples) {
				27
				28	// Pre-calculate common multiplication with \|fs_mult_\|.
				29	int fs_mult_120 = fs_mult_ * 120; // Corresponds to 15 ms.
				30
				31	const int16_t* signal;
				32	scoped_array<int16_t> signal_array;
				33	size_t signal_len;
				34	if (num_channels_ == 1) {
				35	signal = input;
				36	signal_len = input_len;
				37	} else {
				38	// We want \|signal\| to be only the first channel of \|input\|, which is
				39	// interleaved. Thus, we take the first sample, skip forward \|num_channels\|
				40	// samples, and continue like that.
				41	signal_len = input_len / num_channels_;
				42	signal_array.reset(new int16_t[signal_len]);
				43	signal = signal_array.get();
				44	size_t j = master_channel_;
				45	for (size_t i = 0; i < signal_len; ++i) {
				46	signal_array[i] = input[j];
				47	j += num_channels_;
				48	}
				49	}
				50
				51	// Find maximum absolute value of input signal.
				52	max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
				53
				54	// Downsample to 4 kHz sample rate and calculate auto-correlation.
				55	DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
				56	sample_rate_hz_, true /* compensate delay*/,
				57	downsampled_input_);
				58	AutoCorrelation();
				59
				60	// Find the strongest correlation peak.
				61	static const int kNumPeaks = 1;
				62	int peak_index;
				63	int16_t peak_value;
				64	DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
				65	fs_mult_, &peak_index, &peak_value);
				66	// Assert that \|peak_index\| stays within boundaries.
				67	assert(peak_index >= 0);
				68	assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
				69
				70	// Compensate peak_index for displaced starting position. The displacement
				71	// happens in AutoCorrelation(). Here, \|kMinLag\| is in the down-sampled 4 kHz
				72	// domain, while the \|peak_index\| is in the original sample rate; hence, the
				73	// multiplication by fs_mult_ * 2.
				74	peak_index += kMinLag * fs_mult_ * 2;
				75	// Assert that \|peak_index\| stays within boundaries.
				76	assert(peak_index >= 20 * fs_mult_);
				77	assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
				78
				79	// Calculate scaling to ensure that \|peak_index\| samples can be square-summed
				80	// without overflowing.
				81	int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
				82	WebRtcSpl_NormW32(peak_index);
				83	scaling = std::max(0, scaling);
				84
				85	// \|vec1\| starts at 15 ms minus one pitch period.
				86	const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
				87	// \|vec2\| start at 15 ms.
				88	const int16_t* vec2 = &signal[fs_mult_120];
				89	// Calculate energies for \|vec1\| and \|vec2\|, assuming they both contain
				90	// \|peak_index\| samples.
				91	int32_t vec1_energy =
				92	WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
				93	int32_t vec2_energy =
				94	WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
				95
				96	// Calculate cross-correlation between \|vec1\| and \|vec2\|.
				97	int32_t cross_corr =
				98	WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
				99
				100	// Check if the signal seems to be active speech or not (simple VAD).
				101	bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index,
				102	scaling);
				103
				104	int16_t best_correlation;
				105	if (!active_speech) {
				106	SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
				107	} else {
				108	// Calculate correlation:
				109	// cross_corr / sqrt(vec1_energy * vec2_energy).
				110
				111	// Start with calculating scale values.
				112	int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
				113	int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
				114
				115	// Make sure total scaling is even (to simplify scale factor after sqrt).
				116	if ((energy1_scale + energy2_scale) & 1) {
				117	// The sum is odd.
				118	energy1_scale += 1;
				119	}
				120
				121	// Scale energies to int16_t.
				122	int16_t vec1_energy_int16 =
				123	static_cast<int16_t>(vec1_energy >> energy1_scale);
				124	int16_t vec2_energy_int16 =
				125	static_cast<int16_t>(vec2_energy >> energy2_scale);
				126
				127	// Calculate square-root of energy product.
				128	int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 *
				129	vec2_energy_int16);
				130
				131	// Calculate cross_corr / sqrt(en1*en2) in Q14.
				132	int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
				133	cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
				134	cross_corr = std::max(0, cross_corr); // Don't use if negative.
				135	best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
				136	// Make sure \|best_correlation\| is no larger than 1 in Q14.
				137	best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
				138	}
				139
				140
				141	// Check accelerate criteria and stretch the signal.
				142	ReturnCodes return_value = CheckCriteriaAndStretch(input, input_len,
				143	peak_index,
				144	best_correlation,
				145	active_speech, output);
				146	switch (return_value) {
				147	case kSuccess:
				148	*length_change_samples = peak_index;
				149	break;
				150	case kSuccessLowEnergy:
				151	*length_change_samples = peak_index;
				152	break;
				153	case kNoStretch:
				154	case kError:
				155	*length_change_samples = 0;
				156	break;
				157	}
				158	return return_value;
				159	}
				160
				161	void TimeStretch::AutoCorrelation() {
				162	// Set scaling factor for cross correlation to protect against overflow.
				163	int scaling = kLogCorrelationLen - WebRtcSpl_NormW32(
				164	max_input_value_ * max_input_value_);
				165	scaling = std::max(0, scaling);
				166
				167	// Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
				168	int32_t auto_corr[kCorrelationLen];
				169	WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag],
				170	&downsampled_input_[kMaxLag - kMinLag],
				171	kCorrelationLen, kMaxLag - kMinLag, scaling, -1);
				172
				173	// Normalize correlation to 14 bits and write to \|auto_correlation_\|.
				174	int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
				175	scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
				176	WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
				177	auto_corr, scaling);
				178	}
				179
				180	bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
				181	int peak_index, int scaling) const {
				182	// Check if the signal seems to be active speech or not (simple VAD).
				183	// If (vec1_energy + vec2_energy) / (2 * peak_index) <=
				184	// 8 * background_noise_energy, then we say that the signal contains no
				185	// active speech.
				186	// Rewrite the inequality as:
				187	// (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
				188	// The two sides of the inequality will be denoted \|left_side\| and
				189	// \|right_side\|.
				190	int32_t left_side = (vec1_energy + vec2_energy) / 16;
				191	int32_t right_side;
				192	if (background_noise_.initialized()) {
				193	right_side = background_noise_.Energy(master_channel_);
				194	} else {
				195	// If noise parameters have not been estimated, use a fixed threshold.
				196	right_side = 75000;
				197	}
				198	int right_scale = 16 - WebRtcSpl_NormW32(right_side);
				199	right_scale = std::max(0, right_scale);
				200	left_side = left_side >> right_scale;
				201	right_side = peak_index * (right_side >> right_scale);
				202
				203	// Scale \|left_side\| properly before comparing with \|right_side\|.
				204	// (\|scaling\| is the scale factor before energy calculation, thus the scale
				205	// factor for the energy is 2 * scaling.)
				206	if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
				207	// Cannot scale only \|left_side\|, must scale \|right_side\| too.
				208	int temp_scale = WebRtcSpl_NormW32(left_side);
				209	left_side = left_side << temp_scale;
				210	right_side = right_side >> (2 * scaling - temp_scale);
				211	} else {
				212	left_side = left_side << 2 * scaling;
				213	}
				214	return left_side > right_side;
				215	}
				216
				217	} // namespace webrtc