blob: 7b63ac3241256eeb999e599c6fba3ff2ea75a8f5 [file] [log] [blame]
henrik.lundin@webrtc.org9a400812013-01-29 12:09:21 +00001/*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "webrtc/modules/audio_coding/neteq4/time_stretch.h"
12
13#include <algorithm> // min, max
14
15#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
16#include "webrtc/modules/audio_coding/neteq4/background_noise.h"
17#include "webrtc/modules/audio_coding/neteq4/dsp_helper.h"
18#include "webrtc/system_wrappers/interface/scoped_ptr.h"
19
20namespace webrtc {
21
22TimeStretch::ReturnCodes TimeStretch::Process(
23 const int16_t* input,
24 size_t input_len,
25 AudioMultiVector<int16_t>* output,
26 int16_t* length_change_samples) {
27
28 // Pre-calculate common multiplication with |fs_mult_|.
29 int fs_mult_120 = fs_mult_ * 120; // Corresponds to 15 ms.
30
31 const int16_t* signal;
32 scoped_array<int16_t> signal_array;
33 size_t signal_len;
34 if (num_channels_ == 1) {
35 signal = input;
36 signal_len = input_len;
37 } else {
38 // We want |signal| to be only the first channel of |input|, which is
39 // interleaved. Thus, we take the first sample, skip forward |num_channels|
40 // samples, and continue like that.
41 signal_len = input_len / num_channels_;
42 signal_array.reset(new int16_t[signal_len]);
43 signal = signal_array.get();
44 size_t j = master_channel_;
45 for (size_t i = 0; i < signal_len; ++i) {
46 signal_array[i] = input[j];
47 j += num_channels_;
48 }
49 }
50
51 // Find maximum absolute value of input signal.
52 max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
53
54 // Downsample to 4 kHz sample rate and calculate auto-correlation.
55 DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
56 sample_rate_hz_, true /* compensate delay*/,
57 downsampled_input_);
58 AutoCorrelation();
59
60 // Find the strongest correlation peak.
61 static const int kNumPeaks = 1;
62 int peak_index;
63 int16_t peak_value;
64 DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
65 fs_mult_, &peak_index, &peak_value);
66 // Assert that |peak_index| stays within boundaries.
67 assert(peak_index >= 0);
68 assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
69
70 // Compensate peak_index for displaced starting position. The displacement
71 // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
72 // domain, while the |peak_index| is in the original sample rate; hence, the
73 // multiplication by fs_mult_ * 2.
74 peak_index += kMinLag * fs_mult_ * 2;
75 // Assert that |peak_index| stays within boundaries.
76 assert(peak_index >= 20 * fs_mult_);
77 assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
78
79 // Calculate scaling to ensure that |peak_index| samples can be square-summed
80 // without overflowing.
81 int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
82 WebRtcSpl_NormW32(peak_index);
83 scaling = std::max(0, scaling);
84
85 // |vec1| starts at 15 ms minus one pitch period.
86 const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
87 // |vec2| start at 15 ms.
88 const int16_t* vec2 = &signal[fs_mult_120];
89 // Calculate energies for |vec1| and |vec2|, assuming they both contain
90 // |peak_index| samples.
91 int32_t vec1_energy =
92 WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
93 int32_t vec2_energy =
94 WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
95
96 // Calculate cross-correlation between |vec1| and |vec2|.
97 int32_t cross_corr =
98 WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
99
100 // Check if the signal seems to be active speech or not (simple VAD).
101 bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index,
102 scaling);
103
104 int16_t best_correlation;
105 if (!active_speech) {
106 SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
107 } else {
108 // Calculate correlation:
109 // cross_corr / sqrt(vec1_energy * vec2_energy).
110
111 // Start with calculating scale values.
112 int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
113 int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
114
115 // Make sure total scaling is even (to simplify scale factor after sqrt).
116 if ((energy1_scale + energy2_scale) & 1) {
117 // The sum is odd.
118 energy1_scale += 1;
119 }
120
121 // Scale energies to int16_t.
122 int16_t vec1_energy_int16 =
123 static_cast<int16_t>(vec1_energy >> energy1_scale);
124 int16_t vec2_energy_int16 =
125 static_cast<int16_t>(vec2_energy >> energy2_scale);
126
127 // Calculate square-root of energy product.
128 int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 *
129 vec2_energy_int16);
130
131 // Calculate cross_corr / sqrt(en1*en2) in Q14.
132 int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
133 cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
134 cross_corr = std::max(0, cross_corr); // Don't use if negative.
135 best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
136 // Make sure |best_correlation| is no larger than 1 in Q14.
137 best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
138 }
139
140
141 // Check accelerate criteria and stretch the signal.
142 ReturnCodes return_value = CheckCriteriaAndStretch(input, input_len,
143 peak_index,
144 best_correlation,
145 active_speech, output);
146 switch (return_value) {
147 case kSuccess:
148 *length_change_samples = peak_index;
149 break;
150 case kSuccessLowEnergy:
151 *length_change_samples = peak_index;
152 break;
153 case kNoStretch:
154 case kError:
155 *length_change_samples = 0;
156 break;
157 }
158 return return_value;
159}
160
161void TimeStretch::AutoCorrelation() {
162 // Set scaling factor for cross correlation to protect against overflow.
163 int scaling = kLogCorrelationLen - WebRtcSpl_NormW32(
164 max_input_value_ * max_input_value_);
165 scaling = std::max(0, scaling);
166
167 // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
168 int32_t auto_corr[kCorrelationLen];
169 WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag],
170 &downsampled_input_[kMaxLag - kMinLag],
171 kCorrelationLen, kMaxLag - kMinLag, scaling, -1);
172
173 // Normalize correlation to 14 bits and write to |auto_correlation_|.
174 int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
175 scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
176 WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
177 auto_corr, scaling);
178}
179
180bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy,
181 int peak_index, int scaling) const {
182 // Check if the signal seems to be active speech or not (simple VAD).
183 // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
184 // 8 * background_noise_energy, then we say that the signal contains no
185 // active speech.
186 // Rewrite the inequality as:
187 // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
188 // The two sides of the inequality will be denoted |left_side| and
189 // |right_side|.
190 int32_t left_side = (vec1_energy + vec2_energy) / 16;
191 int32_t right_side;
192 if (background_noise_.initialized()) {
193 right_side = background_noise_.Energy(master_channel_);
194 } else {
195 // If noise parameters have not been estimated, use a fixed threshold.
196 right_side = 75000;
197 }
198 int right_scale = 16 - WebRtcSpl_NormW32(right_side);
199 right_scale = std::max(0, right_scale);
200 left_side = left_side >> right_scale;
201 right_side = peak_index * (right_side >> right_scale);
202
203 // Scale |left_side| properly before comparing with |right_side|.
204 // (|scaling| is the scale factor before energy calculation, thus the scale
205 // factor for the energy is 2 * scaling.)
206 if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
207 // Cannot scale only |left_side|, must scale |right_side| too.
208 int temp_scale = WebRtcSpl_NormW32(left_side);
209 left_side = left_side << temp_scale;
210 right_side = right_side >> (2 * scaling - temp_scale);
211 } else {
212 left_side = left_side << 2 * scaling;
213 }
214 return left_side > right_side;
215}
216
217} // namespace webrtc