henrik.lundin@webrtc.org | 9a40081 | 2013-01-29 12:09:21 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license |
| 5 | * that can be found in the LICENSE file in the root of the source |
| 6 | * tree. An additional intellectual property rights grant can be found |
| 7 | * in the file PATENTS. All contributing project authors may |
| 8 | * be found in the AUTHORS file in the root of the source tree. |
| 9 | */ |
| 10 | |
| 11 | #include "webrtc/modules/audio_coding/neteq4/time_stretch.h" |
| 12 | |
| 13 | #include <algorithm> // min, max |
| 14 | |
| 15 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" |
| 16 | #include "webrtc/modules/audio_coding/neteq4/background_noise.h" |
| 17 | #include "webrtc/modules/audio_coding/neteq4/dsp_helper.h" |
| 18 | #include "webrtc/system_wrappers/interface/scoped_ptr.h" |
| 19 | |
| 20 | namespace webrtc { |
| 21 | |
| 22 | TimeStretch::ReturnCodes TimeStretch::Process( |
| 23 | const int16_t* input, |
| 24 | size_t input_len, |
| 25 | AudioMultiVector<int16_t>* output, |
| 26 | int16_t* length_change_samples) { |
| 27 | |
| 28 | // Pre-calculate common multiplication with |fs_mult_|. |
| 29 | int fs_mult_120 = fs_mult_ * 120; // Corresponds to 15 ms. |
| 30 | |
| 31 | const int16_t* signal; |
| 32 | scoped_array<int16_t> signal_array; |
| 33 | size_t signal_len; |
| 34 | if (num_channels_ == 1) { |
| 35 | signal = input; |
| 36 | signal_len = input_len; |
| 37 | } else { |
| 38 | // We want |signal| to be only the first channel of |input|, which is |
| 39 | // interleaved. Thus, we take the first sample, skip forward |num_channels| |
| 40 | // samples, and continue like that. |
| 41 | signal_len = input_len / num_channels_; |
| 42 | signal_array.reset(new int16_t[signal_len]); |
| 43 | signal = signal_array.get(); |
| 44 | size_t j = master_channel_; |
| 45 | for (size_t i = 0; i < signal_len; ++i) { |
| 46 | signal_array[i] = input[j]; |
| 47 | j += num_channels_; |
| 48 | } |
| 49 | } |
| 50 | |
| 51 | // Find maximum absolute value of input signal. |
| 52 | max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len); |
| 53 | |
| 54 | // Downsample to 4 kHz sample rate and calculate auto-correlation. |
| 55 | DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen, |
| 56 | sample_rate_hz_, true /* compensate delay*/, |
| 57 | downsampled_input_); |
| 58 | AutoCorrelation(); |
| 59 | |
| 60 | // Find the strongest correlation peak. |
| 61 | static const int kNumPeaks = 1; |
| 62 | int peak_index; |
| 63 | int16_t peak_value; |
| 64 | DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks, |
| 65 | fs_mult_, &peak_index, &peak_value); |
| 66 | // Assert that |peak_index| stays within boundaries. |
| 67 | assert(peak_index >= 0); |
| 68 | assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_); |
| 69 | |
| 70 | // Compensate peak_index for displaced starting position. The displacement |
| 71 | // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz |
| 72 | // domain, while the |peak_index| is in the original sample rate; hence, the |
| 73 | // multiplication by fs_mult_ * 2. |
| 74 | peak_index += kMinLag * fs_mult_ * 2; |
| 75 | // Assert that |peak_index| stays within boundaries. |
| 76 | assert(peak_index >= 20 * fs_mult_); |
| 77 | assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_); |
| 78 | |
| 79 | // Calculate scaling to ensure that |peak_index| samples can be square-summed |
| 80 | // without overflowing. |
| 81 | int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) - |
| 82 | WebRtcSpl_NormW32(peak_index); |
| 83 | scaling = std::max(0, scaling); |
| 84 | |
| 85 | // |vec1| starts at 15 ms minus one pitch period. |
| 86 | const int16_t* vec1 = &signal[fs_mult_120 - peak_index]; |
| 87 | // |vec2| start at 15 ms. |
| 88 | const int16_t* vec2 = &signal[fs_mult_120]; |
| 89 | // Calculate energies for |vec1| and |vec2|, assuming they both contain |
| 90 | // |peak_index| samples. |
| 91 | int32_t vec1_energy = |
| 92 | WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling); |
| 93 | int32_t vec2_energy = |
| 94 | WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling); |
| 95 | |
| 96 | // Calculate cross-correlation between |vec1| and |vec2|. |
| 97 | int32_t cross_corr = |
| 98 | WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling); |
| 99 | |
| 100 | // Check if the signal seems to be active speech or not (simple VAD). |
| 101 | bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index, |
| 102 | scaling); |
| 103 | |
| 104 | int16_t best_correlation; |
| 105 | if (!active_speech) { |
| 106 | SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index); |
| 107 | } else { |
| 108 | // Calculate correlation: |
| 109 | // cross_corr / sqrt(vec1_energy * vec2_energy). |
| 110 | |
| 111 | // Start with calculating scale values. |
| 112 | int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy)); |
| 113 | int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy)); |
| 114 | |
| 115 | // Make sure total scaling is even (to simplify scale factor after sqrt). |
| 116 | if ((energy1_scale + energy2_scale) & 1) { |
| 117 | // The sum is odd. |
| 118 | energy1_scale += 1; |
| 119 | } |
| 120 | |
| 121 | // Scale energies to int16_t. |
| 122 | int16_t vec1_energy_int16 = |
| 123 | static_cast<int16_t>(vec1_energy >> energy1_scale); |
| 124 | int16_t vec2_energy_int16 = |
| 125 | static_cast<int16_t>(vec2_energy >> energy2_scale); |
| 126 | |
| 127 | // Calculate square-root of energy product. |
| 128 | int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 * |
| 129 | vec2_energy_int16); |
| 130 | |
| 131 | // Calculate cross_corr / sqrt(en1*en2) in Q14. |
| 132 | int temp_scale = 14 - (energy1_scale + energy2_scale) / 2; |
| 133 | cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale); |
| 134 | cross_corr = std::max(0, cross_corr); // Don't use if negative. |
| 135 | best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod); |
| 136 | // Make sure |best_correlation| is no larger than 1 in Q14. |
| 137 | best_correlation = std::min(static_cast<int16_t>(16384), best_correlation); |
| 138 | } |
| 139 | |
| 140 | |
| 141 | // Check accelerate criteria and stretch the signal. |
| 142 | ReturnCodes return_value = CheckCriteriaAndStretch(input, input_len, |
| 143 | peak_index, |
| 144 | best_correlation, |
| 145 | active_speech, output); |
| 146 | switch (return_value) { |
| 147 | case kSuccess: |
| 148 | *length_change_samples = peak_index; |
| 149 | break; |
| 150 | case kSuccessLowEnergy: |
| 151 | *length_change_samples = peak_index; |
| 152 | break; |
| 153 | case kNoStretch: |
| 154 | case kError: |
| 155 | *length_change_samples = 0; |
| 156 | break; |
| 157 | } |
| 158 | return return_value; |
| 159 | } |
| 160 | |
| 161 | void TimeStretch::AutoCorrelation() { |
| 162 | // Set scaling factor for cross correlation to protect against overflow. |
| 163 | int scaling = kLogCorrelationLen - WebRtcSpl_NormW32( |
| 164 | max_input_value_ * max_input_value_); |
| 165 | scaling = std::max(0, scaling); |
| 166 | |
| 167 | // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain. |
| 168 | int32_t auto_corr[kCorrelationLen]; |
| 169 | WebRtcSpl_CrossCorrelation(auto_corr, &downsampled_input_[kMaxLag], |
| 170 | &downsampled_input_[kMaxLag - kMinLag], |
| 171 | kCorrelationLen, kMaxLag - kMinLag, scaling, -1); |
| 172 | |
| 173 | // Normalize correlation to 14 bits and write to |auto_correlation_|. |
| 174 | int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen); |
| 175 | scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr)); |
| 176 | WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen, |
| 177 | auto_corr, scaling); |
| 178 | } |
| 179 | |
| 180 | bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy, |
| 181 | int peak_index, int scaling) const { |
| 182 | // Check if the signal seems to be active speech or not (simple VAD). |
| 183 | // If (vec1_energy + vec2_energy) / (2 * peak_index) <= |
| 184 | // 8 * background_noise_energy, then we say that the signal contains no |
| 185 | // active speech. |
| 186 | // Rewrite the inequality as: |
| 187 | // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy. |
| 188 | // The two sides of the inequality will be denoted |left_side| and |
| 189 | // |right_side|. |
| 190 | int32_t left_side = (vec1_energy + vec2_energy) / 16; |
| 191 | int32_t right_side; |
| 192 | if (background_noise_.initialized()) { |
| 193 | right_side = background_noise_.Energy(master_channel_); |
| 194 | } else { |
| 195 | // If noise parameters have not been estimated, use a fixed threshold. |
| 196 | right_side = 75000; |
| 197 | } |
| 198 | int right_scale = 16 - WebRtcSpl_NormW32(right_side); |
| 199 | right_scale = std::max(0, right_scale); |
| 200 | left_side = left_side >> right_scale; |
| 201 | right_side = peak_index * (right_side >> right_scale); |
| 202 | |
| 203 | // Scale |left_side| properly before comparing with |right_side|. |
| 204 | // (|scaling| is the scale factor before energy calculation, thus the scale |
| 205 | // factor for the energy is 2 * scaling.) |
| 206 | if (WebRtcSpl_NormW32(left_side) < 2 * scaling) { |
| 207 | // Cannot scale only |left_side|, must scale |right_side| too. |
| 208 | int temp_scale = WebRtcSpl_NormW32(left_side); |
| 209 | left_side = left_side << temp_scale; |
| 210 | right_side = right_side >> (2 * scaling - temp_scale); |
| 211 | } else { |
| 212 | left_side = left_side << 2 * scaling; |
| 213 | } |
| 214 | return left_side > right_side; |
| 215 | } |
| 216 | |
| 217 | } // namespace webrtc |