Add private voice detection instance to replace public voice detector
This adds a second (!) VoiceDetection instance in APM, which is activated via webrtc::AudioProcessing::Config and reports its values in the webrtc::AudioProcessingStats struct.
The alternative is to reuse the existing instance, but that would require adding a proxy interface returned by AudioProcessing::voice_detection() to update the internal config of AudioProcessingImpl when calling voice_detection()->Enable().
Complexity-wise, no reasonable client will enable both interfaces simultaneously, so the footprint is negligible.
Bug: webrtc:9947
Change-Id: I7d8e28b9bf06abab8f9c6822424bdb9d803b987d
Reviewed-on: https://webrtc-review.googlesource.com/c/115243
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#26101}
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 2937c06..c0058c7 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -140,6 +140,7 @@
bool pre_amplifier_enabled,
bool echo_controller_enabled,
bool voice_activity_detector_enabled,
+ bool private_voice_detector_enabled,
bool level_estimator_enabled,
bool transient_suppressor_enabled) {
bool changed = false;
@@ -159,6 +160,8 @@
changed |= (level_estimator_enabled != level_estimator_enabled_);
changed |=
(voice_activity_detector_enabled != voice_activity_detector_enabled_);
+ changed |=
+ (private_voice_detector_enabled != private_voice_detector_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
if (changed) {
high_pass_filter_enabled_ = high_pass_filter_enabled;
@@ -172,6 +175,7 @@
echo_controller_enabled_ = echo_controller_enabled;
level_estimator_enabled_ = level_estimator_enabled;
voice_activity_detector_enabled_ = voice_activity_detector_enabled;
+ private_voice_detector_enabled_ = private_voice_detector_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled;
}
@@ -182,7 +186,8 @@
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive()
const {
- return CaptureMultiBandProcessingActive() || voice_activity_detector_enabled_;
+ return CaptureMultiBandProcessingActive() ||
+ voice_activity_detector_enabled_ || private_voice_detector_enabled_;
}
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive()
@@ -260,6 +265,7 @@
std::unique_ptr<GainApplier> pre_amplifier;
std::unique_ptr<CustomAudioAnalyzer> capture_analyzer;
std::unique_ptr<LevelEstimatorImpl> output_level_estimator;
+ std::unique_ptr<VoiceDetectionImpl> voice_detector;
};
AudioProcessingBuilder::AudioProcessingBuilder() = default;
@@ -540,6 +546,10 @@
public_submodules_->noise_suppression->Initialize(num_proc_channels(),
proc_sample_rate_hz());
public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz());
+ if (private_submodules_->voice_detector) {
+ private_submodules_->voice_detector->Initialize(
+ proc_split_sample_rate_hz());
+ }
public_submodules_->level_estimator->Initialize();
InitializeResidualEchoDetector();
InitializeEchoController();
@@ -681,6 +691,16 @@
new LevelEstimatorImpl(&crit_capture_));
private_submodules_->output_level_estimator->Enable(true);
}
+
+ if (config_.voice_detection.enabled && !private_submodules_->voice_detector) {
+ private_submodules_->voice_detector.reset(
+ new VoiceDetectionImpl(&crit_capture_));
+ private_submodules_->voice_detector->Enable(true);
+ private_submodules_->voice_detector->set_likelihood(
+ VoiceDetection::kVeryLowLikelihood);
+ private_submodules_->voice_detector->Initialize(
+ proc_split_sample_rate_hz());
+ }
}
void AudioProcessingImpl::SetExtraOptions(const webrtc::Config& config) {
@@ -1285,6 +1305,13 @@
}
public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer);
+ if (config_.voice_detection.enabled) {
+ private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
+ capture_.stats.voice_detected =
+ private_submodules_->voice_detector->stream_has_voice();
+ } else {
+ capture_.stats.voice_detected = absl::nullopt;
+ }
if (constants_.use_experimental_agc &&
public_submodules_->gain_control->is_enabled() &&
@@ -1695,6 +1722,7 @@
config_.gain_controller2.enabled, config_.pre_amplifier.enabled,
capture_nonlocked_.echo_controller_enabled,
public_submodules_->voice_detection->is_enabled(),
+ config_.voice_detection.enabled,
public_submodules_->level_estimator->is_enabled(),
capture_.transient_suppressor_enabled);
}
diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index 2f946c5..815cc95 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@@ -181,6 +181,7 @@
bool pre_amplifier_enabled,
bool echo_controller_enabled,
bool voice_activity_detector_enabled,
+ bool private_voice_detector_enabled,
bool level_estimator_enabled,
bool transient_suppressor_enabled);
bool CaptureMultiBandSubModulesActive() const;
@@ -207,6 +208,7 @@
bool echo_controller_enabled_ = false;
bool level_estimator_enabled_ = false;
bool voice_activity_detector_enabled_ = false;
+ bool private_voice_detector_enabled_ = false;
bool transient_suppressor_enabled_ = false;
bool first_update_ = true;
};
diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc
index d01333a..5bd2fae 100644
--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc
@@ -2696,7 +2696,7 @@
// Set up an audioframe.
AudioFrame frame;
frame.num_channels_ = 1;
- SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
+ SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data();
@@ -2755,7 +2755,7 @@
// Set up an audioframe.
AudioFrame frame;
frame.num_channels_ = 1;
- SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
+ SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data();
@@ -2809,7 +2809,7 @@
// Set up an audioframe.
AudioFrame frame;
frame.num_channels_ = 1;
- SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate48kHz);
+ SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
// Fill the audio frame with a sawtooth pattern.
int16_t* ptr = frame.mutable_data();
@@ -2838,4 +2838,41 @@
EXPECT_EQ(apm->ProcessStream(&frame), 0);
EXPECT_FALSE(apm->GetStatistics(false).output_rms_dbfs);
}
+
+TEST(ApmStatistics, ReportHasVoice) {
+ ProcessingConfig processing_config = {
+ {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
+ AudioProcessing::Config config;
+
+ // Set up an audioframe.
+ AudioFrame frame;
+ frame.num_channels_ = 1;
+ SetFrameSampleRate(&frame, AudioProcessing::NativeRate::kSampleRate32kHz);
+
+ // Fill the audio frame with a sawtooth pattern.
+ int16_t* ptr = frame.mutable_data();
+ for (size_t i = 0; i < frame.kMaxDataSizeSamples; i++) {
+ ptr[i] = 10000 * ((i % 3) - 1);
+ }
+
+ std::unique_ptr<AudioProcessing> apm(AudioProcessingBuilder().Create());
+ apm->Initialize(processing_config);
+
+ // If not enabled, no metric should be reported.
+ EXPECT_EQ(apm->ProcessStream(&frame), 0);
+ EXPECT_FALSE(apm->GetStatistics(false).voice_detected);
+
+ // If enabled, metrics should be reported.
+ config.voice_detection.enabled = true;
+ apm->ApplyConfig(config);
+ EXPECT_EQ(apm->ProcessStream(&frame), 0);
+ auto stats = apm->GetStatistics(false);
+ EXPECT_TRUE(stats.voice_detected);
+
+ // If re-disabled, the value is again not reported.
+ config.voice_detection.enabled = false;
+ apm->ApplyConfig(config);
+ EXPECT_EQ(apm->ProcessStream(&frame), 0);
+ EXPECT_FALSE(apm->GetStatistics(false).voice_detected);
+}
} // namespace webrtc
diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index df51313..429816b 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@@ -288,6 +288,11 @@
bool enabled = false;
} level_estimation;
+  // Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
+ struct VoiceDetection {
+ bool enabled = false;
+ } voice_detection;
+
// Explicit copy assignment implementation to avoid issues with memory
// sanitizer complaints in case of self-assignment.
// TODO(peah): Add buildflag to ensure that this is only included for memory
diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h
index 683db05..87babee 100644
--- a/modules/audio_processing/include/audio_processing_statistics.h
+++ b/modules/audio_processing/include/audio_processing_statistics.h
@@ -32,6 +32,12 @@
// Only reported if level estimation is enabled in AudioProcessing::Config.
absl::optional<int> output_rms_dbfs;
+ // True if voice is detected in the last capture frame, after processing.
+ // It is conservative in flagging audio as speech, with low likelihood of
+ // incorrectly flagging a frame as voice.
+ // Only reported if voice detection is enabled in AudioProcessing::Config.
+ absl::optional<bool> voice_detected;
+
// AEC Statistics.
// ERL = 10log_10(P_far / P_echo)
absl::optional<double> echo_return_loss;