Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 1 | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "content/browser/speech/speech_recognizer_impl.h" |
| 6 | |
| 7 | #include "base/basictypes.h" |
| 8 | #include "base/bind.h" |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 9 | #include "base/time/time.h" |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 10 | #include "content/browser/browser_main_loop.h" |
| 11 | #include "content/browser/speech/audio_buffer.h" |
| 12 | #include "content/browser/speech/google_one_shot_remote_engine.h" |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 13 | #include "content/public/browser/speech_recognition_event_listener.h" |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 14 | #include "media/base/audio_converter.h" |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 15 | #include "net/url_request/url_request_context_getter.h" |
| 16 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 17 | #if defined(OS_WIN) |
| 18 | #include "media/audio/win/core_audio_util_win.h" |
| 19 | #endif |
| 20 | |
| 21 | using media::AudioBus; |
| 22 | using media::AudioConverter; |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 23 | using media::AudioInputController; |
| 24 | using media::AudioManager; |
| 25 | using media::AudioParameters; |
| 26 | using media::ChannelLayout; |
| 27 | |
| 28 | namespace content { |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 29 | |
| 30 | // Private class which encapsulates the audio converter and the |
| 31 | // AudioConverter::InputCallback. It handles resampling, buffering and |
| 32 | // channel mixing between input and output parameters. |
| 33 | class SpeechRecognizerImpl::OnDataConverter |
| 34 | : public media::AudioConverter::InputCallback { |
| 35 | public: |
| 36 | OnDataConverter(const AudioParameters& input_params, |
| 37 | const AudioParameters& output_params); |
| 38 | virtual ~OnDataConverter(); |
| 39 | |
| 40 | // Converts input |data| buffer into an AudioChunk where the input format |
| 41 | // is given by |input_parameters_| and the output format by |
| 42 | // |output_parameters_|. |
| 43 | scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size); |
| 44 | |
| 45 | private: |
| 46 | // media::AudioConverter::InputCallback implementation. |
| 47 | virtual double ProvideInput(AudioBus* dest, |
| 48 | base::TimeDelta buffer_delay) OVERRIDE; |
| 49 | |
| 50 | // Handles resampling, buffering, and channel mixing between input and output |
| 51 | // parameters. |
| 52 | AudioConverter audio_converter_; |
| 53 | |
| 54 | scoped_ptr<AudioBus> input_bus_; |
| 55 | scoped_ptr<AudioBus> output_bus_; |
| 56 | const AudioParameters input_parameters_; |
| 57 | const AudioParameters output_parameters_; |
| 58 | bool waiting_for_input_; |
| 59 | scoped_ptr<uint8[]> converted_data_; |
| 60 | |
| 61 | DISALLOW_COPY_AND_ASSIGN(OnDataConverter); |
| 62 | }; |
| 63 | |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 64 | namespace { |
| 65 | |
// Tuning constants for the volume level indicator that the UI shows for the
// recorded audio.

// Smoothing multiplier applied when the new volume is greater than the
// previous level.
const float kUpSmoothingFactor = 1.0f;
// Smoothing multiplier applied when the new volume is lesser than the
// previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// RMS dB value for int16 samples whose 6 most significant bits are zero.
// Anything below this value is displayed as an empty level meter.
const float kAudioMeterMinDb = 30.0f;
// Span, in dB, between the two bounds above.
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Highest level drawn for an unclipped meter (1.0f is used to display
// clipping).
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;
| 81 | |
| 82 | // Returns true if more than 5% of the samples are at min or max value. |
| 83 | bool DetectClipping(const AudioChunk& chunk) { |
| 84 | const int num_samples = chunk.NumSamples(); |
| 85 | const int16* samples = chunk.SamplesData16(); |
| 86 | const int kThreshold = num_samples / 20; |
| 87 | int clipping_samples = 0; |
| 88 | |
| 89 | for (int i = 0; i < num_samples; ++i) { |
| 90 | if (samples[i] <= -32767 || samples[i] >= 32767) { |
| 91 | if (++clipping_samples > kThreshold) |
| 92 | return true; |
| 93 | } |
| 94 | } |
| 95 | return false; |
| 96 | } |
| 97 | |
| 98 | void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) { |
| 99 | } |
| 100 | |
| 101 | } // namespace |
| 102 | |
| 103 | const int SpeechRecognizerImpl::kAudioSampleRate = 16000; |
| 104 | const ChannelLayout SpeechRecognizerImpl::kChannelLayout = |
| 105 | media::CHANNEL_LAYOUT_MONO; |
| 106 | const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; |
| 107 | const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; |
| 108 | const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; |
| 109 | media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; |
| 110 | |
| 111 | COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, |
| 112 | kNumBitsPerAudioSample_must_be_a_multiple_of_8); |
| 113 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 114 | // SpeechRecognizerImpl::OnDataConverter implementation |
| 115 | |
| 116 | SpeechRecognizerImpl::OnDataConverter::OnDataConverter( |
| 117 | const AudioParameters& input_params, const AudioParameters& output_params) |
| 118 | : audio_converter_(input_params, output_params, false), |
| 119 | input_bus_(AudioBus::Create(input_params)), |
| 120 | output_bus_(AudioBus::Create(output_params)), |
| 121 | input_parameters_(input_params), |
| 122 | output_parameters_(output_params), |
| 123 | waiting_for_input_(false), |
| 124 | converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) { |
| 125 | audio_converter_.AddInput(this); |
| 126 | } |
| 127 | |
| 128 | SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() { |
| 129 | // It should now be safe to unregister the converter since no more OnData() |
| 130 | // callbacks are outstanding at this point. |
| 131 | audio_converter_.RemoveInput(this); |
| 132 | } |
| 133 | |
| 134 | scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert( |
| 135 | const uint8* data, size_t size) { |
| 136 | CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer())); |
| 137 | |
| 138 | input_bus_->FromInterleaved( |
| 139 | data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8); |
| 140 | |
| 141 | waiting_for_input_ = true; |
| 142 | audio_converter_.Convert(output_bus_.get()); |
| 143 | |
| 144 | output_bus_->ToInterleaved( |
| 145 | output_bus_->frames(), output_parameters_.bits_per_sample() / 8, |
| 146 | converted_data_.get()); |
| 147 | |
| 148 | // TODO(primiano): Refactor AudioChunk to avoid the extra-copy here |
| 149 | // (see http://crbug.com/249316 for details). |
| 150 | return scoped_refptr<AudioChunk>(new AudioChunk( |
| 151 | converted_data_.get(), |
| 152 | output_parameters_.GetBytesPerBuffer(), |
| 153 | output_parameters_.bits_per_sample() / 8)); |
| 154 | } |
| 155 | |
| 156 | double SpeechRecognizerImpl::OnDataConverter::ProvideInput( |
| 157 | AudioBus* dest, base::TimeDelta buffer_delay) { |
| 158 | // The audio converted should never ask for more than one bus in each call |
| 159 | // to Convert(). If so, we have a serious issue in our design since we might |
| 160 | // miss recorded chunks of 100 ms audio data. |
| 161 | CHECK(waiting_for_input_); |
| 162 | |
| 163 | // Read from the input bus to feed the converter. |
| 164 | input_bus_->CopyTo(dest); |
| 165 | |
| 166 | // |input_bus_| should only be provide once. |
| 167 | waiting_for_input_ = false; |
| 168 | return 1; |
| 169 | } |
| 170 | |
| 171 | // SpeechRecognizerImpl implementation |
| 172 | |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 173 | SpeechRecognizerImpl::SpeechRecognizerImpl( |
| 174 | SpeechRecognitionEventListener* listener, |
| 175 | int session_id, |
| 176 | bool is_single_shot, |
| 177 | SpeechRecognitionEngine* engine) |
| 178 | : SpeechRecognizer(listener, session_id), |
| 179 | recognition_engine_(engine), |
| 180 | endpointer_(kAudioSampleRate), |
| 181 | is_dispatching_event_(false), |
| 182 | is_single_shot_(is_single_shot), |
| 183 | state_(STATE_IDLE) { |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 184 | DCHECK(recognition_engine_ != NULL); |
| 185 | if (is_single_shot) { |
| 186 | // In single shot recognition, the session is automatically ended after: |
| 187 | // - 0.5 seconds of silence if time < 3 seconds |
| 188 | // - 1 seconds of silence if time >= 3 seconds |
| 189 | endpointer_.set_speech_input_complete_silence_length( |
| 190 | base::Time::kMicrosecondsPerSecond / 2); |
| 191 | endpointer_.set_long_speech_input_complete_silence_length( |
| 192 | base::Time::kMicrosecondsPerSecond); |
| 193 | endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); |
| 194 | } else { |
| 195 | // In continuous recognition, the session is automatically ended after 15 |
| 196 | // seconds of silence. |
| 197 | const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15; |
| 198 | endpointer_.set_speech_input_complete_silence_length(cont_timeout_us); |
| 199 | endpointer_.set_long_speech_length(0); // Use only a single timeout. |
| 200 | } |
| 201 | endpointer_.StartSession(); |
| 202 | recognition_engine_->set_delegate(this); |
| 203 | } |
| 204 | |
| 205 | // ------- Methods that trigger Finite State Machine (FSM) events ------------ |
| 206 | |
// NOTE: all the external events and requests should be enqueued (PostTask),
// even if they come from the same (IO) thread, in order to preserve the
// causality relationship between events and to avoid interleaved event
// processing due to synchronous callbacks.
| 211 | |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 212 | void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { |
| 213 | DCHECK(!device_id.empty()); |
| 214 | device_id_ = device_id; |
| 215 | |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 216 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 217 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 218 | this, FSMEventArgs(EVENT_START))); |
| 219 | } |
| 220 | |
| 221 | void SpeechRecognizerImpl::AbortRecognition() { |
| 222 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 223 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 224 | this, FSMEventArgs(EVENT_ABORT))); |
| 225 | } |
| 226 | |
| 227 | void SpeechRecognizerImpl::StopAudioCapture() { |
| 228 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 229 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 230 | this, FSMEventArgs(EVENT_STOP_CAPTURE))); |
| 231 | } |
| 232 | |
| 233 | bool SpeechRecognizerImpl::IsActive() const { |
| 234 | // Checking the FSM state from another thread (thus, while the FSM is |
| 235 | // potentially concurrently evolving) is meaningless. |
| 236 | DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 237 | return state_ != STATE_IDLE && state_ != STATE_ENDED; |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 238 | } |
| 239 | |
| 240 | bool SpeechRecognizerImpl::IsCapturingAudio() const { |
| 241 | DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). |
| 242 | const bool is_capturing_audio = state_ >= STATE_STARTING && |
| 243 | state_ <= STATE_RECOGNIZING; |
| 244 | DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || |
| 245 | (!is_capturing_audio && audio_controller_.get() == NULL)); |
| 246 | return is_capturing_audio; |
| 247 | } |
| 248 | |
| 249 | const SpeechRecognitionEngine& |
| 250 | SpeechRecognizerImpl::recognition_engine() const { |
| 251 | return *(recognition_engine_.get()); |
| 252 | } |
| 253 | |
| 254 | SpeechRecognizerImpl::~SpeechRecognizerImpl() { |
| 255 | endpointer_.EndSession(); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 256 | if (audio_controller_.get()) { |
| 257 | audio_controller_->Close( |
| 258 | base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_)); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 259 | } |
| 260 | } |
| 261 | |
| 262 | // Invoked in the audio thread. |
| 263 | void SpeechRecognizerImpl::OnError(AudioInputController* controller) { |
| 264 | FSMEventArgs event_args(EVENT_AUDIO_ERROR); |
| 265 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 266 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 267 | this, event_args)); |
| 268 | } |
| 269 | |
| 270 | void SpeechRecognizerImpl::OnData(AudioInputController* controller, |
| 271 | const uint8* data, uint32 size) { |
| 272 | if (size == 0) // This could happen when audio capture stops and is normal. |
| 273 | return; |
| 274 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 275 | // Convert audio from native format to fixed format used by WebSpeech. |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 276 | FSMEventArgs event_args(EVENT_AUDIO_DATA); |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 277 | event_args.audio_data = audio_converter_->Convert(data, size); |
| 278 | |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 279 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 280 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 281 | this, event_args)); |
| 282 | } |
| 283 | |
| 284 | void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} |
| 285 | |
| 286 | void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( |
| 287 | const SpeechRecognitionResults& results) { |
| 288 | FSMEventArgs event_args(EVENT_ENGINE_RESULT); |
| 289 | event_args.engine_results = results; |
| 290 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 291 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 292 | this, event_args)); |
| 293 | } |
| 294 | |
| 295 | void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( |
| 296 | const SpeechRecognitionError& error) { |
| 297 | FSMEventArgs event_args(EVENT_ENGINE_ERROR); |
| 298 | event_args.engine_error = error; |
| 299 | BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, |
| 300 | base::Bind(&SpeechRecognizerImpl::DispatchEvent, |
| 301 | this, event_args)); |
| 302 | } |
| 303 | |
| 304 | // ----------------------- Core FSM implementation --------------------------- |
| 305 | // TODO(primiano): After the changes in the media package (r129173), this class |
| 306 | // slightly violates the SpeechRecognitionEventListener interface contract. In |
| 307 | // particular, it is not true anymore that this class can be freed after the |
| 308 | // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous |
| 309 | // call can be still in progress after the end event. Currently, it does not |
| 310 | // represent a problem for the browser itself, since refcounting protects us |
| 311 | // against such race conditions. However, we should fix this in the next CLs. |
// For instance, tests are currently working just because the
// TestAudioInputController is not closing asynchronously as the real controller
// does, but they will become flaky once TestAudioInputController is fixed.
| 315 | |
| 316 | void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { |
| 317 | DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); |
| 318 | DCHECK_LE(event_args.event, EVENT_MAX_VALUE); |
| 319 | DCHECK_LE(state_, STATE_MAX_VALUE); |
| 320 | |
| 321 | // Event dispatching must be sequential, otherwise it will break all the rules |
| 322 | // and the assumptions of the finite state automata model. |
| 323 | DCHECK(!is_dispatching_event_); |
| 324 | is_dispatching_event_ = true; |
| 325 | |
| 326 | // Guard against the delegate freeing us until we finish processing the event. |
| 327 | scoped_refptr<SpeechRecognizerImpl> me(this); |
| 328 | |
| 329 | if (event_args.event == EVENT_AUDIO_DATA) { |
| 330 | DCHECK(event_args.audio_data.get() != NULL); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 331 | ProcessAudioPipeline(*event_args.audio_data.get()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 332 | } |
| 333 | |
| 334 | // The audio pipeline must be processed before the event dispatch, otherwise |
| 335 | // it would take actions according to the future state instead of the current. |
| 336 | state_ = ExecuteTransitionAndGetNextState(event_args); |
| 337 | is_dispatching_event_ = false; |
| 338 | } |
| 339 | |
| 340 | SpeechRecognizerImpl::FSMState |
| 341 | SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( |
| 342 | const FSMEventArgs& event_args) { |
| 343 | const FSMEvent event = event_args.event; |
| 344 | switch (state_) { |
| 345 | case STATE_IDLE: |
| 346 | switch (event) { |
| 347 | // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and |
| 348 | // EVENT_STOP_CAPTURE below once speech input extensions are fixed. |
| 349 | case EVENT_ABORT: |
| 350 | return AbortSilently(event_args); |
| 351 | case EVENT_START: |
| 352 | return StartRecording(event_args); |
| 353 | case EVENT_STOP_CAPTURE: |
| 354 | return AbortSilently(event_args); |
| 355 | case EVENT_AUDIO_DATA: // Corner cases related to queued messages |
| 356 | case EVENT_ENGINE_RESULT: // being lately dispatched. |
| 357 | case EVENT_ENGINE_ERROR: |
| 358 | case EVENT_AUDIO_ERROR: |
| 359 | return DoNothing(event_args); |
| 360 | } |
| 361 | break; |
| 362 | case STATE_STARTING: |
| 363 | switch (event) { |
| 364 | case EVENT_ABORT: |
| 365 | return AbortWithError(event_args); |
| 366 | case EVENT_START: |
| 367 | return NotFeasible(event_args); |
| 368 | case EVENT_STOP_CAPTURE: |
| 369 | return AbortSilently(event_args); |
| 370 | case EVENT_AUDIO_DATA: |
| 371 | return StartRecognitionEngine(event_args); |
| 372 | case EVENT_ENGINE_RESULT: |
| 373 | return NotFeasible(event_args); |
| 374 | case EVENT_ENGINE_ERROR: |
| 375 | case EVENT_AUDIO_ERROR: |
| 376 | return AbortWithError(event_args); |
| 377 | } |
| 378 | break; |
| 379 | case STATE_ESTIMATING_ENVIRONMENT: |
| 380 | switch (event) { |
| 381 | case EVENT_ABORT: |
| 382 | return AbortWithError(event_args); |
| 383 | case EVENT_START: |
| 384 | return NotFeasible(event_args); |
| 385 | case EVENT_STOP_CAPTURE: |
| 386 | return StopCaptureAndWaitForResult(event_args); |
| 387 | case EVENT_AUDIO_DATA: |
| 388 | return WaitEnvironmentEstimationCompletion(event_args); |
| 389 | case EVENT_ENGINE_RESULT: |
| 390 | return ProcessIntermediateResult(event_args); |
| 391 | case EVENT_ENGINE_ERROR: |
| 392 | case EVENT_AUDIO_ERROR: |
| 393 | return AbortWithError(event_args); |
| 394 | } |
| 395 | break; |
| 396 | case STATE_WAITING_FOR_SPEECH: |
| 397 | switch (event) { |
| 398 | case EVENT_ABORT: |
| 399 | return AbortWithError(event_args); |
| 400 | case EVENT_START: |
| 401 | return NotFeasible(event_args); |
| 402 | case EVENT_STOP_CAPTURE: |
| 403 | return StopCaptureAndWaitForResult(event_args); |
| 404 | case EVENT_AUDIO_DATA: |
| 405 | return DetectUserSpeechOrTimeout(event_args); |
| 406 | case EVENT_ENGINE_RESULT: |
| 407 | return ProcessIntermediateResult(event_args); |
| 408 | case EVENT_ENGINE_ERROR: |
| 409 | case EVENT_AUDIO_ERROR: |
| 410 | return AbortWithError(event_args); |
| 411 | } |
| 412 | break; |
| 413 | case STATE_RECOGNIZING: |
| 414 | switch (event) { |
| 415 | case EVENT_ABORT: |
| 416 | return AbortWithError(event_args); |
| 417 | case EVENT_START: |
| 418 | return NotFeasible(event_args); |
| 419 | case EVENT_STOP_CAPTURE: |
| 420 | return StopCaptureAndWaitForResult(event_args); |
| 421 | case EVENT_AUDIO_DATA: |
| 422 | return DetectEndOfSpeech(event_args); |
| 423 | case EVENT_ENGINE_RESULT: |
| 424 | return ProcessIntermediateResult(event_args); |
| 425 | case EVENT_ENGINE_ERROR: |
| 426 | case EVENT_AUDIO_ERROR: |
| 427 | return AbortWithError(event_args); |
| 428 | } |
| 429 | break; |
| 430 | case STATE_WAITING_FINAL_RESULT: |
| 431 | switch (event) { |
| 432 | case EVENT_ABORT: |
| 433 | return AbortWithError(event_args); |
| 434 | case EVENT_START: |
| 435 | return NotFeasible(event_args); |
| 436 | case EVENT_STOP_CAPTURE: |
| 437 | case EVENT_AUDIO_DATA: |
| 438 | return DoNothing(event_args); |
| 439 | case EVENT_ENGINE_RESULT: |
| 440 | return ProcessFinalResult(event_args); |
| 441 | case EVENT_ENGINE_ERROR: |
| 442 | case EVENT_AUDIO_ERROR: |
| 443 | return AbortWithError(event_args); |
| 444 | } |
| 445 | break; |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 446 | |
| 447 | // TODO(primiano): remove this state when speech input extensions support |
| 448 | // will be removed and STATE_IDLE.EVENT_ABORT,EVENT_STOP_CAPTURE will be |
| 449 | // reset to NotFeasible (see TODO above). |
| 450 | case STATE_ENDED: |
| 451 | return DoNothing(event_args); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 452 | } |
| 453 | return NotFeasible(event_args); |
| 454 | } |
| 455 | |
| 456 | // ----------- Contract for all the FSM evolution functions below ------------- |
| 457 | // - Are guaranteed to be executed in the IO thread; |
| 458 | // - Are guaranteed to be not reentrant (themselves and each other); |
| 459 | // - event_args members are guaranteed to be stable during the call; |
| 460 | // - The class won't be freed in the meanwhile due to callbacks; |
| 461 | // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. |
| 462 | |
// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worth it.
| 466 | void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { |
| 467 | const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && |
| 468 | state_ <= STATE_RECOGNIZING; |
| 469 | const bool route_to_sr_engine = route_to_endpointer; |
| 470 | const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && |
| 471 | state_ <= STATE_RECOGNIZING; |
| 472 | const bool clip_detected = DetectClipping(raw_audio); |
| 473 | float rms = 0.0f; |
| 474 | |
| 475 | num_samples_recorded_ += raw_audio.NumSamples(); |
| 476 | |
| 477 | if (route_to_endpointer) |
| 478 | endpointer_.ProcessAudio(raw_audio, &rms); |
| 479 | |
| 480 | if (route_to_vumeter) { |
| 481 | DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. |
| 482 | UpdateSignalAndNoiseLevels(rms, clip_detected); |
| 483 | } |
| 484 | if (route_to_sr_engine) { |
| 485 | DCHECK(recognition_engine_.get() != NULL); |
| 486 | recognition_engine_->TakeAudioChunk(raw_audio); |
| 487 | } |
| 488 | } |
| 489 | |
| 490 | SpeechRecognizerImpl::FSMState |
| 491 | SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { |
| 492 | DCHECK(recognition_engine_.get() != NULL); |
| 493 | DCHECK(!IsCapturingAudio()); |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 494 | const bool unit_test_is_active = (audio_manager_for_tests_ != NULL); |
| 495 | AudioManager* audio_manager = unit_test_is_active ? |
| 496 | audio_manager_for_tests_ : |
| 497 | AudioManager::Get(); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 498 | DCHECK(audio_manager != NULL); |
| 499 | |
| 500 | DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; |
| 501 | num_samples_recorded_ = 0; |
| 502 | audio_level_ = 0; |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 503 | listener()->OnRecognitionStart(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 504 | |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 505 | // TODO(xians): Check if the OS has the device with |device_id_|, return |
| 506 | // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist. |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 507 | if (!audio_manager->HasAudioInputDevices()) { |
| 508 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO, |
| 509 | SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); |
| 510 | } |
| 511 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 512 | int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs(); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 513 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 514 | AudioParameters in_params = audio_manager->GetInputStreamParameters( |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 515 | device_id_); |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 516 | if (!in_params.IsValid() && !unit_test_is_active) { |
| 517 | DLOG(ERROR) << "Invalid native audio input parameters"; |
| 518 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 519 | } |
| 520 | |
| 521 | // Audio converter shall provide audio based on these parameters as output. |
| 522 | // Hard coded, WebSpeech specific parameters are utilized here. |
| 523 | int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000; |
| 524 | AudioParameters output_parameters = AudioParameters( |
| 525 | AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate, |
| 526 | kNumBitsPerAudioSample, frames_per_buffer); |
| 527 | |
| 528 | // Audio converter will receive audio based on these parameters as input. |
| 529 | // On Windows we start by verifying that Core Audio is supported. If not, |
| 530 | // the WaveIn API is used and we might as well avoid all audio conversations |
| 531 | // since WaveIn does the conversion for us. |
| 532 | // TODO(henrika): this code should be moved to platform dependent audio |
| 533 | // managers. |
| 534 | bool use_native_audio_params = true; |
| 535 | #if defined(OS_WIN) |
| 536 | use_native_audio_params = media::CoreAudioUtil::IsSupported(); |
| 537 | DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech"; |
| 538 | #endif |
| 539 | |
| 540 | AudioParameters input_parameters = output_parameters; |
| 541 | if (use_native_audio_params && !unit_test_is_active) { |
| 542 | // Use native audio parameters but avoid opening up at the native buffer |
| 543 | // size. Instead use same frame size (in milliseconds) as WebSpeech uses. |
| 544 | // We rely on internal buffers in the audio back-end to fulfill this request |
| 545 | // and the idea is to simplify the audio conversion since each Convert() |
| 546 | // call will then render exactly one ProvideInput() call. |
| 547 | // Due to implementation details in the audio converter, 2 milliseconds |
| 548 | // are added to the default frame size (100 ms) to ensure there is enough |
| 549 | // data to generate 100 ms of output when resampling. |
| 550 | frames_per_buffer = |
| 551 | ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5; |
| 552 | input_parameters.Reset(in_params.format(), |
| 553 | in_params.channel_layout(), |
| 554 | in_params.channels(), |
| 555 | in_params.input_channels(), |
| 556 | in_params.sample_rate(), |
| 557 | in_params.bits_per_sample(), |
| 558 | frames_per_buffer); |
| 559 | } |
| 560 | |
| 561 | // Create an audio converter which converts data between native input format |
| 562 | // and WebSpeech specific output format. |
| 563 | audio_converter_.reset( |
| 564 | new OnDataConverter(input_parameters, output_parameters)); |
| 565 | |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 566 | audio_controller_ = AudioInputController::Create( |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 567 | audio_manager, this, input_parameters, device_id_); |
Torne (Richard Coles) | 7d4cd47 | 2013-06-19 11:58:07 +0100 | [diff] [blame] | 568 | |
Ben Murdoch | eb525c5 | 2013-07-10 11:40:50 +0100 | [diff] [blame] | 569 | if (!audio_controller_.get()) { |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 570 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 571 | } |
| 572 | |
| 573 | // The endpointer needs to estimate the environment/background noise before |
| 574 | // starting to treat the audio as user input. We wait in the state |
| 575 | // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching |
| 576 | // to user input mode. |
| 577 | endpointer_.SetEnvironmentEstimationMode(); |
| 578 | audio_controller_->Record(); |
| 579 | return STATE_STARTING; |
| 580 | } |
| 581 | |
| 582 | SpeechRecognizerImpl::FSMState |
| 583 | SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { |
| 584 | // This is the first audio packet captured, so the recognition engine is |
| 585 | // started and the delegate notified about the event. |
| 586 | DCHECK(recognition_engine_.get() != NULL); |
| 587 | recognition_engine_->StartRecognition(); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 588 | listener()->OnAudioStart(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 589 | |
| 590 | // This is a little hack, since TakeAudioChunk() is already called by |
| 591 | // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping |
| 592 | // the first audio chunk captured after opening the audio device. |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 593 | recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get())); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 594 | return STATE_ESTIMATING_ENVIRONMENT; |
| 595 | } |
| 596 | |
| 597 | SpeechRecognizerImpl::FSMState |
| 598 | SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { |
| 599 | DCHECK(endpointer_.IsEstimatingEnvironment()); |
| 600 | if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { |
| 601 | endpointer_.SetUserInputMode(); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 602 | listener()->OnEnvironmentEstimationComplete(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 603 | return STATE_WAITING_FOR_SPEECH; |
| 604 | } else { |
| 605 | return STATE_ESTIMATING_ENVIRONMENT; |
| 606 | } |
| 607 | } |
| 608 | |
| 609 | SpeechRecognizerImpl::FSMState |
| 610 | SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { |
| 611 | if (endpointer_.DidStartReceivingSpeech()) { |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 612 | listener()->OnSoundStart(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 613 | return STATE_RECOGNIZING; |
| 614 | } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { |
| 615 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH)); |
| 616 | } |
| 617 | return STATE_WAITING_FOR_SPEECH; |
| 618 | } |
| 619 | |
| 620 | SpeechRecognizerImpl::FSMState |
| 621 | SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { |
| 622 | if (endpointer_.speech_input_complete()) |
| 623 | return StopCaptureAndWaitForResult(event_args); |
| 624 | return STATE_RECOGNIZING; |
| 625 | } |
| 626 | |
| 627 | SpeechRecognizerImpl::FSMState |
| 628 | SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { |
| 629 | DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); |
| 630 | |
| 631 | DVLOG(1) << "Concluding recognition"; |
| 632 | CloseAudioControllerAsynchronously(); |
| 633 | recognition_engine_->AudioChunksEnded(); |
| 634 | |
| 635 | if (state_ > STATE_WAITING_FOR_SPEECH) |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 636 | listener()->OnSoundEnd(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 637 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 638 | listener()->OnAudioEnd(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 639 | return STATE_WAITING_FINAL_RESULT; |
| 640 | } |
| 641 | |
| 642 | SpeechRecognizerImpl::FSMState |
| 643 | SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) { |
| 644 | DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR); |
| 645 | DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR); |
| 646 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE)); |
| 647 | } |
| 648 | |
| 649 | SpeechRecognizerImpl::FSMState |
| 650 | SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) { |
| 651 | if (event_args.event == EVENT_AUDIO_ERROR) { |
| 652 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); |
| 653 | } else if (event_args.event == EVENT_ENGINE_ERROR) { |
| 654 | return Abort(event_args.engine_error); |
| 655 | } |
| 656 | return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); |
| 657 | } |
| 658 | |
| 659 | SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( |
| 660 | const SpeechRecognitionError& error) { |
| 661 | if (IsCapturingAudio()) |
| 662 | CloseAudioControllerAsynchronously(); |
| 663 | |
| 664 | DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; |
| 665 | |
| 666 | // The recognition engine is initialized only after STATE_STARTING. |
| 667 | if (state_ > STATE_STARTING) { |
| 668 | DCHECK(recognition_engine_.get() != NULL); |
| 669 | recognition_engine_->EndRecognition(); |
| 670 | } |
| 671 | |
| 672 | if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 673 | listener()->OnSoundEnd(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 674 | |
| 675 | if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 676 | listener()->OnAudioEnd(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 677 | |
| 678 | if (error.code != SPEECH_RECOGNITION_ERROR_NONE) |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 679 | listener()->OnRecognitionError(session_id(), error); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 680 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 681 | listener()->OnRecognitionEnd(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 682 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 683 | return STATE_ENDED; |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 684 | } |
| 685 | |
| 686 | SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult( |
| 687 | const FSMEventArgs& event_args) { |
| 688 | // Provisional results can occur only during continuous (non one-shot) mode. |
| 689 | // If this check is reached it means that a continuous speech recognition |
| 690 | // engine is being used for a one shot recognition. |
| 691 | DCHECK_EQ(false, is_single_shot_); |
| 692 | |
| 693 | // In continuous recognition, intermediate results can occur even when we are |
| 694 | // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the |
| 695 | // recognition engine is "faster" than our endpointer). In these cases we |
| 696 | // skip the endpointer and fast-forward to the RECOGNIZING state, with respect |
| 697 | // of the events triggering order. |
| 698 | if (state_ == STATE_ESTIMATING_ENVIRONMENT) { |
| 699 | DCHECK(endpointer_.IsEstimatingEnvironment()); |
| 700 | endpointer_.SetUserInputMode(); |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 701 | listener()->OnEnvironmentEstimationComplete(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 702 | } else if (state_ == STATE_WAITING_FOR_SPEECH) { |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 703 | listener()->OnSoundStart(session_id()); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 704 | } else { |
| 705 | DCHECK_EQ(STATE_RECOGNIZING, state_); |
| 706 | } |
| 707 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 708 | listener()->OnRecognitionResults(session_id(), event_args.engine_results); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 709 | return STATE_RECOGNIZING; |
| 710 | } |
| 711 | |
| 712 | SpeechRecognizerImpl::FSMState |
| 713 | SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { |
| 714 | const SpeechRecognitionResults& results = event_args.engine_results; |
| 715 | SpeechRecognitionResults::const_iterator i = results.begin(); |
| 716 | bool provisional_results_pending = false; |
| 717 | bool results_are_empty = true; |
| 718 | for (; i != results.end(); ++i) { |
| 719 | const SpeechRecognitionResult& result = *i; |
| 720 | if (result.is_provisional) { |
| 721 | provisional_results_pending = true; |
| 722 | DCHECK(!is_single_shot_); |
| 723 | } else if (results_are_empty) { |
| 724 | results_are_empty = result.hypotheses.empty(); |
| 725 | } |
| 726 | } |
| 727 | |
| 728 | if (provisional_results_pending) { |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 729 | listener()->OnRecognitionResults(session_id(), results); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 730 | // We don't end the recognition if a provisional result is received in |
| 731 | // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will |
| 732 | // end the recognition. |
| 733 | return state_; |
| 734 | } |
| 735 | |
| 736 | recognition_engine_->EndRecognition(); |
| 737 | |
| 738 | if (!results_are_empty) { |
| 739 | // We could receive an empty result (which we won't propagate further) |
| 740 | // in the following (continuous) scenario: |
| 741 | // 1. The caller start pushing audio and receives some results; |
| 742 | // 2. A |StopAudioCapture| is issued later; |
| 743 | // 3. The final audio frames captured in the interval ]1,2] do not lead to |
| 744 | // any result (nor any error); |
| 745 | // 4. The speech recognition engine, therefore, emits an empty result to |
| 746 | // notify that the recognition is ended with no error, yet neither any |
| 747 | // further result. |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 748 | listener()->OnRecognitionResults(session_id(), results); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 749 | } |
| 750 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 751 | listener()->OnRecognitionEnd(session_id()); |
| 752 | return STATE_ENDED; |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 753 | } |
| 754 | |
| 755 | SpeechRecognizerImpl::FSMState |
| 756 | SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { |
| 757 | return state_; // Just keep the current state. |
| 758 | } |
| 759 | |
| 760 | SpeechRecognizerImpl::FSMState |
| 761 | SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { |
| 762 | NOTREACHED() << "Unfeasible event " << event_args.event |
| 763 | << " in state " << state_; |
| 764 | return state_; |
| 765 | } |
| 766 | |
| 767 | void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { |
| 768 | DCHECK(IsCapturingAudio()); |
| 769 | DVLOG(1) << "SpeechRecognizerImpl closing audio controller."; |
| 770 | // Issues a Close on the audio controller, passing an empty callback. The only |
| 771 | // purpose of such callback is to keep the audio controller refcounted until |
| 772 | // Close has completed (in the audio thread) and automatically destroy it |
| 773 | // afterwards (upon return from OnAudioClosed). |
| 774 | audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, |
| 775 | this, audio_controller_)); |
| 776 | audio_controller_ = NULL; // The controller is still refcounted by Bind. |
| 777 | } |
| 778 | |
| 779 | int SpeechRecognizerImpl::GetElapsedTimeMs() const { |
| 780 | return (num_samples_recorded_ * 1000) / kAudioSampleRate; |
| 781 | } |
| 782 | |
| 783 | void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, |
| 784 | bool clip_detected) { |
| 785 | // Calculate the input volume to display in the UI, smoothing towards the |
| 786 | // new level. |
| 787 | // TODO(primiano): Do we really need all this floating point arith here? |
| 788 | // Perhaps it might be quite expensive on mobile. |
| 789 | float level = (rms - kAudioMeterMinDb) / |
| 790 | (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 791 | level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); |
| 792 | const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : |
| 793 | kDownSmoothingFactor; |
| 794 | audio_level_ += (level - audio_level_) * smoothing_factor; |
| 795 | |
| 796 | float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / |
| 797 | (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); |
| 798 | noise_level = std::min(std::max(0.0f, noise_level), |
| 799 | kAudioMeterRangeMaxUnclipped); |
| 800 | |
Torne (Richard Coles) | 868fa2f | 2013-06-11 10:57:03 +0100 | [diff] [blame] | 801 | listener()->OnAudioLevelsChange( |
| 802 | session_id(), clip_detected ? 1.0f : audio_level_, noise_level); |
Torne (Richard Coles) | 90dce4d | 2013-05-29 14:40:03 +0100 | [diff] [blame] | 803 | } |
| 804 | |
| 805 | void SpeechRecognizerImpl::SetAudioManagerForTests( |
| 806 | AudioManager* audio_manager) { |
| 807 | audio_manager_for_tests_ = audio_manager; |
| 808 | } |
| 809 | |
| 810 | SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) |
| 811 | : event(event_value), |
| 812 | audio_data(NULL), |
| 813 | engine_error(SPEECH_RECOGNITION_ERROR_NONE) { |
| 814 | } |
| 815 | |
| 816 | SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { |
| 817 | } |
| 818 | |
| 819 | } // namespace content |