// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/time/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"

#if defined(OS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

using media::AudioBus;
using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts the input |data| buffer into an AudioChunk, where the input
  // format is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and output
  // parameters.
  AudioConverter audio_converter_;

  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  bool waiting_for_input_;
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when the new volume is greater than the previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when the new volume is less than the previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
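// Note: 20 * log10(2^15) ~= 90.31, i.e. this constant matches the dB value
// of full scale (32768) for int16 samples.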
// This value corresponds to the RMS dB for int16 samples with the 6
// most-significant bits set to 0. Values lower than this are displayed as an
// empty level meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

// Returns true if more than 5% of the samples are at min or max value.
bool DetectClipping(const AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;

  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

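// No-op callback for AudioInputController::Close(): binding the scoped_refptr
// keeps the controller refcounted until the asynchronous Close() completes.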
void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {
}

}  // namespace

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout =
    media::CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);
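// With these constants, audio is captured as 16-bit mono PCM at 16 kHz, i.e.
// 32 bytes per millisecond; assuming the engine's default 100 ms chunk
// duration (see the comments in StartRecording below), each AudioChunk then
// carries 1600 frames (3200 bytes).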

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const uint8* data, size_t size) {
  CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer()));

  input_bus_->FromInterleaved(
      data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8);

  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If it did, we would have a serious issue in our design,
  // since we might miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
  waiting_for_input_ = false;
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool is_single_shot,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      is_dispatching_event_(false),
      is_single_shot_(is_single_shot),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (is_single_shot) {
    // In single-shot recognition, the session is automatically ended after:
    // - 0.5 seconds of silence if time < 3 seconds;
    // - 1 second of silence if time >= 3 seconds.
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// ------- Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: all external events and requests should be enqueued (PostTask), even
// if they come from the same (IO) thread, in order to preserve the causal
// relationship between events and to avoid interleaved event processing due
// to synchronous callbacks.

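// For illustration only, a typical client interaction (the actual call sites
// live in the speech recognition manager) looks roughly like:
//
//   scoped_refptr<SpeechRecognizerImpl> recognizer(new SpeechRecognizerImpl(
//       listener, session_id, true /* is_single_shot */, engine));
//   recognizer->StartRecognition(device_id);
//   ...
//   recognizer->StopAudioCapture();  // Or AbortRecognition() to cancel.
//
// Results and state changes are reported asynchronously through the
// SpeechRecognitionEventListener passed to the constructor.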
void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) {
  DCHECK(!device_id.empty());
  device_id_ = device_id;

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_START)));
}

void SpeechRecognizerImpl::AbortRecognition() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_ABORT)));
}

void SpeechRecognizerImpl::StopAudioCapture() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
}

bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
  }
}

// Invoked in the audio thread.
void SpeechRecognizerImpl::OnError(AudioInputController* controller) {
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const uint8* data, uint32 size) {
  if (size == 0)  // This could happen when audio capture stops and is normal.
    return;

  // Convert audio from native format to fixed format used by WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data, size);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// ----------------------- Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is no longer true that this class can be freed after the
// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
// call can still be in progress after the end event. Currently, this is not a
// problem for the browser itself, since refcounting protects us against such
// race conditions. However, we should fix this in upcoming CLs. For instance,
// tests are currently working just because the TestAudioInputController is not
// closing asynchronously as the real controller does, but they will become
// flaky if TestAudioInputController is fixed.

void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential, otherwise it will break all the
  // rules and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the
  // event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch, otherwise
  // it would take actions according to the future state instead of the
  // current.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

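// For orientation, the happy path through the transition table below is:
//   STATE_IDLE --EVENT_START--> STATE_STARTING --first EVENT_AUDIO_DATA-->
//   STATE_ESTIMATING_ENVIRONMENT --> STATE_WAITING_FOR_SPEECH -->
//   STATE_RECOGNIZING --EVENT_STOP_CAPTURE or end of speech-->
//   STATE_WAITING_FINAL_RESULT --EVENT_ENGINE_RESULT--> STATE_ENDED.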
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
        case EVENT_ABORT:
          return AbortSilently(event_args);
        case EVENT_START:
          return StartRecording(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:     // Corner cases related to queued messages
        case EVENT_ENGINE_RESULT:  // being dispatched late.
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return DoNothing(event_args);
      }
      break;
    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_ESTIMATING_ENVIRONMENT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return WaitEnvironmentEstimationCompletion(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectUserSpeechOrTimeout(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_RECOGNIZING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectEndOfSpeech(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FINAL_RESULT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
        case EVENT_AUDIO_DATA:
          return DoNothing(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessFinalResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;

    // TODO(primiano): remove this state when speech input extension support
    // is removed and STATE_IDLE.{EVENT_ABORT, EVENT_STOP_CAPTURE} are reset
    // to NotFeasible (see the TODO above).
    case STATE_ENDED:
      return DoNothing(event_args);
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
// - Are guaranteed to be executed on the IO thread;
// - Are guaranteed not to be reentrant (themselves and each other);
// - event_args members are guaranteed to be stable during the call;
// - The class won't be freed in the meanwhile due to callbacks;
// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worth it.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check if the OS has the device with |device_id_|, return
  // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist.
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The audio converter shall provide audio based on these parameters as
  // output. Hard-coded, WebSpeech-specific parameters are utilized here.
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);
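  // (With the constants above and the engine's default 100 ms chunks, this
  // yields (16000 * 100) / 1000 = 1600 frames per buffer.)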

  // The audio converter will receive audio based on these parameters as input.
  // On Windows we start by verifying that Core Audio is supported. If not,
  // the WaveIn API is used and we might as well avoid all audio conversions
  // since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to platform dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use native audio parameters but avoid opening up at the native buffer
    // size. Instead use the same frame size (in milliseconds) as WebSpeech
    // uses. We rely on internal buffers in the audio back-end to fulfill this
    // request, and the idea is to simplify the audio conversion since each
    // Convert() call will then render exactly one ProvideInput() call.
    // Due to implementation details in the audio converter, 2 milliseconds
    // are added to the default frame size (100 ms) to ensure there is enough
    // data to generate 100 ms of output when resampling.
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
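    // For example, with a hypothetical 44.1 kHz native rate this evaluates to
    // ((44100 * 102) / 1000.0) + 0.5 = 4498.7, truncated to 4498 frames.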
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.input_channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between native input format
  // and WebSpeech specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the state
  // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
  // to user input mode.
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  return STATE_STARTING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the delegate notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition.";

  // The recognition engine is initialized only after STATE_STARTING.
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only during continuous (non one-shot) mode.
  // If this check is reached, it means that a continuous speech recognition
  // engine is being used for a one-shot recognition.
  DCHECK_EQ(false, is_single_shot_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, respecting
  // the order in which events are triggered.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      provisional_results_pending = true;
      DCHECK(!is_single_shot_);
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    // 1. The caller starts pushing audio and receives some results;
    // 2. A |StopAudioCapture| is issued later;
    // 3. The final audio frames captured in the interval ]1,2] do not lead to
    //    any result (nor to any error);
    // 4. The speech recognition engine, therefore, emits an empty result to
    //    notify that the recognition has ended with no error and with no
    //    further results.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issues a Close on the audio controller, passing an empty callback. The
  // only purpose of such callback is to keep the audio controller refcounted
  // until Close has completed (in the audio thread) and automatically destroy
  // it afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
}

int SpeechRecognizerImpl::GetElapsedTimeMs() const {
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arith here?
  // Perhaps it might be quite expensive on mobile.
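  // For example, an |rms| reading of 60 dB maps to
  // (60 - 30) / (60.31 / 0.979) ~= 0.49 before smoothing; the figures here
  // are illustrative only.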
  float level = (rms - kAudioMeterMinDb) /
                (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
                      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

void SpeechRecognizerImpl::SetAudioManagerForTests(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content
819} // namespace content