<html><body>
<style>

body, h1, h2, h3, div, span, p, pre, a {
  margin: 0;
  padding: 0;
  border: 0;
  font-weight: inherit;
  font-style: inherit;
  font-size: 100%;
  font-family: inherit;
  vertical-align: baseline;
}

body {
  font-size: 13px;
  padding: 1em;
}

h1 {
  font-size: 26px;
  margin-bottom: 1em;
}

h2 {
  font-size: 24px;
  margin-bottom: 1em;
}

h3 {
  font-size: 20px;
  margin-bottom: 1em;
  margin-top: 1em;
}

pre, code {
  line-height: 1.5;
  font-family: Monaco, 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', 'Lucida Console', monospace;
}

pre {
  margin-top: 0.5em;
}

h1, h2, h3, p {
  font-family: Arial, sans-serif;
}

h1, h2, h3 {
  border-bottom: solid #CCC 1px;
}

.toc_element {
  margin-top: 0.5em;
}

.firstline {
  margin-left: 2em;
}

.method {
  margin-top: 1em;
  border: solid 1px #CCC;
  padding: 1em;
  background: #EEE;
}

.details {
  font-weight: bold;
  font-size: 14px;
}

</style>

<h1><a href="speech_v1p1beta1.html">Cloud Speech-to-Text API</a> . <a href="speech_v1p1beta1.speech.html">speech</a></h1>
<h2>Instance Methods</h2>
<p class="toc_element">
  <code><a href="#longrunningrecognize">longrunningrecognize(body=None, x__xgafv=None)</a></code></p>
<p class="firstline">Performs asynchronous speech recognition: receive results via the google.longrunning.Operations interface.</p>
<p class="toc_element">
  <code><a href="#recognize">recognize(body=None, x__xgafv=None)</a></code></p>
<p class="firstline">Performs synchronous speech recognition: receive results after all audio has been sent and processed.</p>
<h3>Method Details</h3>
<div class="method">
    <code class="details" id="longrunningrecognize">longrunningrecognize(body=None, x__xgafv=None)</code>
  <pre>Performs asynchronous speech recognition: receive results via the
google.longrunning.Operations interface. Returns either an
`Operation.error` or an `Operation.response`, which contains
a `LongRunningRecognizeResponse` message.
For more information on asynchronous speech recognition, see the
[how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).

Args:
  body: object, The request body.
    The object takes the form of:

{ # The top-level message sent by the client for the `LongRunningRecognize`
    # method.
  &quot;audio&quot;: { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized.
      # Either `content` or `uri` must be supplied. Supplying both or neither
      # returns google.rpc.Code.INVALID_ARGUMENT. See
      # [content limits](https://cloud.google.com/speech-to-text/quotas#content).
    &quot;content&quot;: &quot;A String&quot;, # The audio data bytes encoded as specified in
        # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
        # pure binary representation, whereas JSON representations use base64.
    &quot;uri&quot;: &quot;A String&quot;, # URI that points to a file that contains audio data bytes as specified in
        # `RecognitionConfig`. The file must not be compressed (for example, gzip).
        # Currently, only Google Cloud Storage URIs are
        # supported, which must be specified in the following format:
        # `gs://bucket_name/object_name` (other URI formats return
        # google.rpc.Code.INVALID_ARGUMENT). For more information, see
        # [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
  },
  &quot;config&quot;: { # Provides information to the recognizer that specifies how to process the request. # Required. Provides information to the recognizer that specifies how to process the request.
    &quot;encoding&quot;: &quot;A String&quot;, # Encoding of audio data sent in all `RecognitionAudio` messages.
        # This field is optional for `FLAC` and `WAV` audio files and required
        # for all other audio formats. For details, see AudioEncoding.
    &quot;audioChannelCount&quot;: 42, # The number of channels in the input audio data.
        # ONLY set this for MULTI-CHANNEL recognition.
        # Valid values for LINEAR16 and FLAC are `1`-`8`.
        # Valid values for OGG_OPUS are `1`-`254`.
        # The only valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is `1`.
        # If `0` or omitted, defaults to one channel (mono).
        # Note: We only recognize the first channel by default.
        # To perform independent recognition on each channel, set
        # `enable_separate_recognition_per_channel` to `true`.
    &quot;languageCode&quot;: &quot;A String&quot;, # Required. The language of the supplied audio as a
        # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
        # Example: &quot;en-US&quot;.
        # See [Language
        # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
        # of the currently supported language codes.
    &quot;metadata&quot;: { # Description of audio data to be recognized. # Metadata regarding this request.
      &quot;audioTopic&quot;: &quot;A String&quot;, # Description of the content. E.g. &quot;Recordings of federal supreme court
          # hearings from 2012&quot;.
      &quot;originalMediaType&quot;: &quot;A String&quot;, # The original media the speech was recorded on.
      &quot;interactionType&quot;: &quot;A String&quot;, # The use case most closely describing the audio content to be recognized.
      &quot;recordingDeviceName&quot;: &quot;A String&quot;, # The device used to make the recording. Examples: &#x27;Nexus 5X&#x27;,
          # &#x27;Polycom SoundStation IP 6000&#x27;, &#x27;POTS&#x27;, &#x27;VoIP&#x27;, or
          # &#x27;Cardioid Microphone&#x27;.
      &quot;microphoneDistance&quot;: &quot;A String&quot;, # The audio type that most closely describes the audio being recognized.
      &quot;industryNaicsCodeOfAudio&quot;: 42, # The industry vertical to which this speech recognition request most
          # closely applies. This is most indicative of the topics contained
          # in the audio. Use the 6-digit NAICS code to identify the industry
          # vertical - see https://www.naics.com/search/.
      &quot;recordingDeviceType&quot;: &quot;A String&quot;, # The type of device the speech was recorded with.
      &quot;originalMimeType&quot;: &quot;A String&quot;, # MIME type of the original audio file. For example `audio/m4a`,
          # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
          # A list of possible audio MIME types is maintained at
          # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
      &quot;obfuscatedId&quot;: &quot;A String&quot;, # Obfuscated (privacy-protected) ID of the user, used to identify the number of
          # unique users using the service.
    },
    &quot;maxAlternatives&quot;: 42, # Maximum number of recognition hypotheses to be returned.
        # Specifically, the maximum number of `SpeechRecognitionAlternative` messages
        # within each `SpeechRecognitionResult`.
        # The server may return fewer than `max_alternatives`.
        # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
        # one. If omitted, it will return a maximum of one.
    &quot;profanityFilter&quot;: True or False, # If set to `true`, the server will attempt to filter out
        # profanities, replacing all but the initial character in each filtered word
        # with asterisks, e.g. &quot;f***&quot;. If set to `false` or omitted, profanities
        # won&#x27;t be filtered out.
    &quot;sampleRateHertz&quot;: 42, # Sample rate in Hertz of the audio data sent in all
        # `RecognitionAudio` messages. Valid values are: 8000-48000.
        # 16000 is optimal. For best results, set the sampling rate of the audio
        # source to 16000 Hz. If that&#x27;s not possible, use the native sample rate of
        # the audio source (instead of re-sampling).
        # This field is optional for FLAC and WAV audio files, but is
        # required for all other audio formats. For details, see AudioEncoding.
    &quot;diarizationSpeakerCount&quot;: 42, # If set, specifies the estimated number of speakers in the conversation.
        # Defaults to `2`. Ignored unless enable_speaker_diarization is set to true.
        # Note: Use diarization_config instead.
    &quot;enableWordConfidence&quot;: True or False, # If `true`, the top result includes a list of words and the
        # confidence for those words. If `false`, no word-level confidence
        # information is returned. The default is `false`.
    &quot;speechContexts&quot;: [ # Array of SpeechContext.
        # A means to provide context to assist the speech recognition. For more
        # information, see
        # [speech
        # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
      { # Provides &quot;hints&quot; to the speech recognizer to favor specific words and phrases
          # in the results.
        &quot;boost&quot;: 3.14, # Hint Boost. A positive value will increase the probability that a specific
            # phrase will be recognized over other similar-sounding phrases. The higher
            # the boost, the higher the chance of false positive recognition as well.
            # Negative boost values would correspond to anti-biasing. Anti-biasing is not
            # enabled, so negative boost will simply be ignored. Though `boost` can
            # accept a wide range of positive values, most use cases are best served with
            # values between 0 and 20. We recommend using a binary search approach to
            # finding the optimal value for your use case.
        &quot;phrases&quot;: [ # A list of strings containing word and phrase &quot;hints&quot; so that
            # the speech recognition is more likely to recognize them. This can be used
            # to improve the accuracy for specific words and phrases, for example, if
            # specific commands are typically spoken by the user. This can also be used
            # to add additional words to the vocabulary of the recognizer. See
            # [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
            #
            # List items can also be set to classes for groups of words that represent
            # common concepts that occur in natural language. For example, rather than
            # providing phrase hints for every month of the year, using the $MONTH class
            # improves the likelihood of correctly transcribing audio that includes
            # months.
          &quot;A String&quot;,
        ],
      },
    ],
    &quot;enableSpeakerDiarization&quot;: True or False, # If `true`, enables speaker detection for each recognized word in
        # the top alternative of the recognition result, using a speaker_tag provided
        # in the WordInfo.
        # Note: Use diarization_config instead.
    &quot;enableAutomaticPunctuation&quot;: True or False, # If `true`, adds punctuation to recognition result hypotheses.
        # This feature is only available in select languages. Setting this for
        # requests in other languages has no effect at all.
        # The default `false` value does not add punctuation to result hypotheses.
    &quot;enableWordTimeOffsets&quot;: True or False, # If `true`, the top result includes a list of words and
        # the start and end time offsets (timestamps) for those words. If
        # `false`, no word-level time offset information is returned. The default is
        # `false`.
    &quot;diarizationConfig&quot;: { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
        # parameters to make diarization better suited for your application.
        # Note: When this is enabled, we send all the words from the beginning of the
        # audio for the top alternative in every consecutive STREAMING response.
        # This is done in order to improve our speaker tags as our models learn to
        # identify the speakers in the conversation over time.
        # For non-streaming requests, the diarization results will be provided only
        # in the top alternative of the FINAL SpeechRecognitionResult.
      &quot;maxSpeakerCount&quot;: 42, # Maximum number of speakers in the conversation. This range gives you more
          # flexibility by allowing the system to automatically determine the correct
          # number of speakers. If not set, the default value is 6.
      &quot;speakerTag&quot;: 42, # Output only. Unused.
      &quot;minSpeakerCount&quot;: 42, # Minimum number of speakers in the conversation. This range gives you more
          # flexibility by allowing the system to automatically determine the correct
          # number of speakers. If not set, the default value is 2.
      &quot;enableSpeakerDiarization&quot;: True or False, # If `true`, enables speaker detection for each recognized word in
          # the top alternative of the recognition result, using a speaker_tag provided
          # in the WordInfo.
    },
    &quot;adaptation&quot;: { # Speech adaptation configuration. # Speech adaptation configuration improves the accuracy of speech
        # recognition. When speech adaptation is set it supersedes the
        # `speech_contexts` field. For more information, see the [speech
        # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
        # documentation.
      &quot;phraseSets&quot;: [ # A collection of phrase sets. To specify the hints inline, leave the
          # phrase set&#x27;s `name` blank and fill in the rest of its fields. Any
          # phrase set can use any custom class.
        { # Provides &quot;hints&quot; to the speech recognizer to favor specific words and phrases
            # in the results.
          &quot;boost&quot;: 3.14, # Hint Boost. A positive value will increase the probability that a specific
              # phrase will be recognized over other similar-sounding phrases. The higher
              # the boost, the higher the chance of false positive recognition as well.
              # Negative boost values would correspond to anti-biasing. Anti-biasing is not
              # enabled, so negative boost will simply be ignored. Though `boost` can
              # accept a wide range of positive values, most use cases are best served with
              # values between 0 (exclusive) and 20. We recommend using a binary search
              # approach to finding the optimal value for your use case. Speech recognition
              # will skip PhraseSets with a boost value of 0.
          &quot;name&quot;: &quot;A String&quot;, # The resource name of the phrase set.
          &quot;phrases&quot;: [ # A list of words and phrases.
            { # A phrase containing words and phrase &quot;hints&quot; so that
                # the speech recognition is more likely to recognize them. This can be used
                # to improve the accuracy for specific words and phrases, for example, if
                # specific commands are typically spoken by the user. This can also be used
                # to add additional words to the vocabulary of the recognizer. See
                # [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
                #
                # List items can also include pre-built or custom classes containing groups
                # of words that represent common concepts that occur in natural language. For
                # example, rather than providing a phrase hint for every month of the
                # year (e.g. &quot;i was born in january&quot;, &quot;i was born in february&quot;, ...), using the
                # pre-built `$MONTH` class improves the likelihood of correctly transcribing
                # audio that includes months (e.g. &quot;i was born in $month&quot;).
                # To refer to pre-built classes, use the class&#x27;s symbol prepended with `$`,
                # e.g. `$MONTH`. To refer to custom classes that were defined inline in the
                # request, set the class&#x27;s `custom_class_id` to a string unique to all class
                # resources and inline classes. Then use the class&#x27;s id wrapped in `${...}`,
                # e.g. &quot;${my-months}&quot;. To refer to custom class resources, use the class&#x27;s
                # id wrapped in `${}` (e.g. `${my-months}`).
              &quot;boost&quot;: 3.14, # Hint Boost. Overrides the boost set at the phrase set level.
                  # A positive value will increase the probability that a specific phrase will
                  # be recognized over other similar-sounding phrases. The higher the boost,
                  # the higher the chance of false positive recognition as well. Negative
                  # boost values would correspond to anti-biasing. Anti-biasing is not
                  # enabled, so negative boost will simply be ignored. Though `boost` can
                  # accept a wide range of positive values, most use cases are best served
                  # with values between 0 and 20. We recommend using a binary search approach
                  # to finding the optimal value for your use case. Speech recognition
                  # will skip PhraseSets with a boost value of 0.
              &quot;value&quot;: &quot;A String&quot;, # The phrase itself.
            },
          ],
        },
      ],
      &quot;customClasses&quot;: [ # A collection of custom classes. To specify the classes inline, leave the
          # class&#x27;s `name` blank and fill in the rest of its fields, giving it a unique
          # `custom_class_id`. Refer to the inline defined class in phrase hints by its
          # `custom_class_id`.
        { # A set of words or phrases that represents a common concept likely to appear
            # in your audio, for example a list of passenger ship names. CustomClass items
            # can be substituted into placeholders that you set in PhraseSet phrases.
          &quot;items&quot;: [ # A collection of class items.
            { # An item of the class.
              &quot;value&quot;: &quot;A String&quot;, # The class item&#x27;s value.
            },
          ],
          &quot;customClassId&quot;: &quot;A String&quot;, # If this custom class is a resource, the custom_class_id is the resource id
              # of the CustomClass. Case sensitive.
          &quot;name&quot;: &quot;A String&quot;, # The resource name of the custom class.
        },
      ],
    },
    &quot;model&quot;: &quot;A String&quot;, # Which model to select for the given request. Select the model
        # best suited to your domain to get best results. If a model is not
        # explicitly specified, then we auto-select a model based on the parameters
        # in the RecognitionConfig.
        # &lt;table&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;b&gt;Model&lt;/b&gt;&lt;/td&gt;
        # &lt;td&gt;&lt;b&gt;Description&lt;/b&gt;&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;command_and_search&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for short queries such as voice commands or voice search.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;phone_call&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that originated from a phone call (typically
        # recorded at an 8 kHz sampling rate).&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;video&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that originated from video or includes multiple
        # speakers. Ideally the audio is recorded at a 16 kHz or greater
        # sampling rate. This is a premium model that costs more than the
        # standard rate.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;default&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that is not one of the specific audio models.
        # For example, long-form audio. Ideally the audio is high-fidelity,
        # recorded at a 16 kHz or greater sampling rate.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;/table&gt;
    &quot;useEnhanced&quot;: True or False, # Set to true to use an enhanced model for speech recognition.
        # If `use_enhanced` is set to true and the `model` field is not set, then
        # an appropriate enhanced model is chosen if an enhanced model exists for
        # the audio.
        #
        # If `use_enhanced` is true and an enhanced version of the specified model
        # does not exist, then the speech is recognized using the standard version
        # of the specified model.
    &quot;alternativeLanguageCodes&quot;: [ # A list of up to 3 additional
        # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
        # listing possible alternative languages of the supplied audio.
        # See [Language
        # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
        # of the currently supported language codes. If alternative languages are
        # listed, the recognition result will contain recognition in the most likely
        # language detected, including the main language_code. The recognition result
        # will include the language tag of the language detected in the audio. Note:
        # This feature is only supported for Voice Command and Voice Search use cases,
        # and performance may vary for other use cases (e.g., phone call
        # transcription).
      &quot;A String&quot;,
    ],
    &quot;enableSeparateRecognitionPerChannel&quot;: True or False, # This needs to be set to `true` explicitly and `audio_channel_count` &gt; 1
        # to get each channel recognized separately. The recognition result will
        # contain a `channel_tag` field to state which channel that result belongs
        # to. If this is not true, we will only recognize the first channel. The
        # request is billed cumulatively for all channels recognized:
        # `audio_channel_count` multiplied by the length of the audio.
  },
}

  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # This resource represents a long-running operation that is the result of a
        # network API call.
      &quot;done&quot;: True or False, # If the value is `false`, it means the operation is still in progress.
          # If `true`, the operation is completed, and either `error` or `response` is
          # available.
      &quot;metadata&quot;: { # Service-specific metadata associated with the operation. It typically
          # contains progress information and common metadata such as create time.
          # Some services might not provide such metadata. Any method that returns a
          # long-running operation should document the metadata type, if any.
        &quot;a_key&quot;: &quot;&quot;, # Properties of the object. Contains field @type with type URL.
      },
      &quot;error&quot;: { # The `Status` type defines a logical error model that is suitable for # The error result of the operation in case of failure or cancellation.
          # different programming environments, including REST APIs and RPC APIs. It is
          # used by [gRPC](https://github.com/grpc). Each `Status` message contains
          # three pieces of data: error code, error message, and error details.
          #
          # You can find out more about this error model and how to work with it in the
          # [API Design Guide](https://cloud.google.com/apis/design/errors).
        &quot;code&quot;: 42, # The status code, which should be an enum value of google.rpc.Code.
        &quot;message&quot;: &quot;A String&quot;, # A developer-facing error message, which should be in English. Any
            # user-facing error message should be localized and sent in the
            # google.rpc.Status.details field, or localized by the client.
        &quot;details&quot;: [ # A list of messages that carry the error details. There is a common set of
            # message types for APIs to use.
          {
            &quot;a_key&quot;: &quot;&quot;, # Properties of the object. Contains field @type with type URL.
          },
        ],
      },
      &quot;name&quot;: &quot;A String&quot;, # The server-assigned name, which is only unique within the same service that
          # originally returns it. If you use the default HTTP mapping, the
          # `name` should be a resource name ending with `operations/{unique_id}`.
      &quot;response&quot;: { # The normal response of the operation in case of success. If the original
          # method returns no data on success, such as `Delete`, the response is
          # `google.protobuf.Empty`. If the original method is standard
          # `Get`/`Create`/`Update`, the response should be the resource. For other
          # methods, the response should have the type `XxxResponse`, where `Xxx`
          # is the original method name. For example, if the original method name
          # is `TakeSnapshot()`, the inferred response type is
          # `TakeSnapshotResponse`.
        &quot;a_key&quot;: &quot;&quot;, # Properties of the object. Contains field @type with type URL.
      },
    }</pre>
</div>
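<p>As an illustrative sketch only (not part of the generated reference): a minimal way to call this method with the google-api-python-client library. The service name <code>speech</code> and version <code>v1p1beta1</code> come from this page; the Cloud Storage URI and the polling interval below are placeholder assumptions.</p>
<pre>
# Sketch: asynchronous recognition, assuming application default
# credentials are configured and that gs://my-bucket/audio.flac
# (a placeholder) points to a real FLAC file.
import time

from googleapiclient import discovery

service = discovery.build('speech', 'v1p1beta1')

body = {
    'config': {
        'languageCode': 'en-US',        # required
        'enableWordTimeOffsets': True,  # word-level timestamps
    },
    'audio': {
        'uri': 'gs://my-bucket/audio.flac',  # placeholder URI
    },
}

# Start the long-running operation, then poll it through the service's
# operations collection until `done` is true.
operation = service.speech().longrunningrecognize(body=body).execute()
while not operation.get('done'):
    time.sleep(5)  # placeholder polling interval
    operation = service.operations().get(name=operation['name']).execute()

# The finished operation carries either `error` or `response`
# (a LongRunningRecognizeResponse).
if 'error' in operation:
    raise RuntimeError(operation['error'])
for result in operation['response'].get('results', []):
    print(result['alternatives'][0]['transcript'])
</pre>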

<div class="method">
    <code class="details" id="recognize">recognize(body=None, x__xgafv=None)</code>
  <pre>Performs synchronous speech recognition: receive results after all audio
has been sent and processed.

Args:
  body: object, The request body.
    The object takes the form of:

{ # The top-level message sent by the client for the `Recognize` method.
  &quot;audio&quot;: { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized.
      # Either `content` or `uri` must be supplied. Supplying both or neither
      # returns google.rpc.Code.INVALID_ARGUMENT. See
      # [content limits](https://cloud.google.com/speech-to-text/quotas#content).
    &quot;content&quot;: &quot;A String&quot;, # The audio data bytes encoded as specified in
        # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
        # pure binary representation, whereas JSON representations use base64.
    &quot;uri&quot;: &quot;A String&quot;, # URI that points to a file that contains audio data bytes as specified in
        # `RecognitionConfig`. The file must not be compressed (for example, gzip).
        # Currently, only Google Cloud Storage URIs are
        # supported, which must be specified in the following format:
        # `gs://bucket_name/object_name` (other URI formats return
        # google.rpc.Code.INVALID_ARGUMENT). For more information, see
        # [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
  },
  &quot;config&quot;: { # Provides information to the recognizer that specifies how to process the request. # Required. Provides information to the recognizer that specifies how to process the request.
    &quot;encoding&quot;: &quot;A String&quot;, # Encoding of audio data sent in all `RecognitionAudio` messages.
        # This field is optional for `FLAC` and `WAV` audio files and required
        # for all other audio formats. For details, see AudioEncoding.
    &quot;audioChannelCount&quot;: 42, # The number of channels in the input audio data.
        # ONLY set this for MULTI-CHANNEL recognition.
        # Valid values for LINEAR16 and FLAC are `1`-`8`.
        # Valid values for OGG_OPUS are `1`-`254`.
        # The only valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is `1`.
        # If `0` or omitted, defaults to one channel (mono).
        # Note: We only recognize the first channel by default.
        # To perform independent recognition on each channel, set
        # `enable_separate_recognition_per_channel` to `true`.
    &quot;languageCode&quot;: &quot;A String&quot;, # Required. The language of the supplied audio as a
        # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
        # Example: &quot;en-US&quot;.
        # See [Language
        # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
        # of the currently supported language codes.
    &quot;metadata&quot;: { # Description of audio data to be recognized. # Metadata regarding this request.
      &quot;audioTopic&quot;: &quot;A String&quot;, # Description of the content. E.g. &quot;Recordings of federal supreme court
          # hearings from 2012&quot;.
      &quot;originalMediaType&quot;: &quot;A String&quot;, # The original media the speech was recorded on.
      &quot;interactionType&quot;: &quot;A String&quot;, # The use case most closely describing the audio content to be recognized.
      &quot;recordingDeviceName&quot;: &quot;A String&quot;, # The device used to make the recording. Examples: &#x27;Nexus 5X&#x27;,
          # &#x27;Polycom SoundStation IP 6000&#x27;, &#x27;POTS&#x27;, &#x27;VoIP&#x27;, or
          # &#x27;Cardioid Microphone&#x27;.
      &quot;microphoneDistance&quot;: &quot;A String&quot;, # The audio type that most closely describes the audio being recognized.
      &quot;industryNaicsCodeOfAudio&quot;: 42, # The industry vertical to which this speech recognition request most
          # closely applies. This is most indicative of the topics contained
          # in the audio. Use the 6-digit NAICS code to identify the industry
          # vertical - see https://www.naics.com/search/.
      &quot;recordingDeviceType&quot;: &quot;A String&quot;, # The type of device the speech was recorded with.
      &quot;originalMimeType&quot;: &quot;A String&quot;, # MIME type of the original audio file. For example `audio/m4a`,
          # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
          # A list of possible audio MIME types is maintained at
          # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
      &quot;obfuscatedId&quot;: &quot;A String&quot;, # Obfuscated (privacy-protected) ID of the user, used to identify the number of
          # unique users using the service.
    },
    &quot;maxAlternatives&quot;: 42, # Maximum number of recognition hypotheses to be returned.
        # Specifically, the maximum number of `SpeechRecognitionAlternative` messages
        # within each `SpeechRecognitionResult`.
        # The server may return fewer than `max_alternatives`.
        # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
        # one. If omitted, it will return a maximum of one.
    &quot;profanityFilter&quot;: True or False, # If set to `true`, the server will attempt to filter out
        # profanities, replacing all but the initial character in each filtered word
        # with asterisks, e.g. &quot;f***&quot;. If set to `false` or omitted, profanities
        # won&#x27;t be filtered out.
    &quot;sampleRateHertz&quot;: 42, # Sample rate in Hertz of the audio data sent in all
        # `RecognitionAudio` messages. Valid values are: 8000-48000.
        # 16000 is optimal. For best results, set the sampling rate of the audio
        # source to 16000 Hz. If that&#x27;s not possible, use the native sample rate of
        # the audio source (instead of re-sampling).
        # This field is optional for FLAC and WAV audio files, but is
        # required for all other audio formats. For details, see AudioEncoding.
    &quot;diarizationSpeakerCount&quot;: 42, # If set, specifies the estimated number of speakers in the conversation.
        # Defaults to `2`. Ignored unless enable_speaker_diarization is set to true.
        # Note: Use diarization_config instead.
    &quot;enableWordConfidence&quot;: True or False, # If `true`, the top result includes a list of words and the
        # confidence for those words. If `false`, no word-level confidence
        # information is returned. The default is `false`.
    &quot;speechContexts&quot;: [ # Array of SpeechContext.
        # A means to provide context to assist the speech recognition. For more
        # information, see
        # [speech
        # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
      { # Provides &quot;hints&quot; to the speech recognizer to favor specific words and phrases
          # in the results.
        &quot;boost&quot;: 3.14, # Hint Boost. A positive value will increase the probability that a specific
            # phrase will be recognized over other similar-sounding phrases. The higher
            # the boost, the higher the chance of false positive recognition as well.
            # Negative boost values would correspond to anti-biasing. Anti-biasing is not
            # enabled, so negative boost will simply be ignored. Though `boost` can
            # accept a wide range of positive values, most use cases are best served with
            # values between 0 and 20. We recommend using a binary search approach to
            # finding the optimal value for your use case.
        &quot;phrases&quot;: [ # A list of strings containing word and phrase &quot;hints&quot; so that
            # the speech recognition is more likely to recognize them. This can be used
            # to improve the accuracy for specific words and phrases, for example, if
            # specific commands are typically spoken by the user. This can also be used
            # to add additional words to the vocabulary of the recognizer. See
            # [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
            #
            # List items can also be set to classes for groups of words that represent
            # common concepts that occur in natural language. For example, rather than
            # providing phrase hints for every month of the year, using the $MONTH class
            # improves the likelihood of correctly transcribing audio that includes
            # months.
          &quot;A String&quot;,
        ],
      },
    ],
    &quot;enableSpeakerDiarization&quot;: True or False, # If `true`, enables speaker detection for each recognized word in
        # the top alternative of the recognition result, using a speaker_tag provided
        # in the WordInfo.
        # Note: Use diarization_config instead.
    &quot;enableAutomaticPunctuation&quot;: True or False, # If `true`, adds punctuation to recognition result hypotheses.
        # This feature is only available in select languages. Setting this for
        # requests in other languages has no effect at all.
        # The default `false` value does not add punctuation to result hypotheses.
    &quot;enableWordTimeOffsets&quot;: True or False, # If `true`, the top result includes a list of words and
        # the start and end time offsets (timestamps) for those words. If
        # `false`, no word-level time offset information is returned. The default is
        # `false`.
    &quot;diarizationConfig&quot;: { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
        # parameters to make diarization better suited for your application.
        # Note: When this is enabled, we send all the words from the beginning of the
        # audio for the top alternative in every consecutive STREAMING response.
        # This is done in order to improve our speaker tags as our models learn to
        # identify the speakers in the conversation over time.
        # For non-streaming requests, the diarization results will be provided only
        # in the top alternative of the FINAL SpeechRecognitionResult.
      &quot;maxSpeakerCount&quot;: 42, # Maximum number of speakers in the conversation. This range gives you more
          # flexibility by allowing the system to automatically determine the correct
          # number of speakers. If not set, the default value is 6.
      &quot;speakerTag&quot;: 42, # Output only. Unused.
      &quot;minSpeakerCount&quot;: 42, # Minimum number of speakers in the conversation. This range gives you more
          # flexibility by allowing the system to automatically determine the correct
          # number of speakers. If not set, the default value is 2.
      &quot;enableSpeakerDiarization&quot;: True or False, # If `true`, enables speaker detection for each recognized word in
          # the top alternative of the recognition result, using a speaker_tag provided
          # in the WordInfo.
    },
    &quot;adaptation&quot;: { # Speech adaptation configuration. # Speech adaptation configuration improves the accuracy of speech
        # recognition. When speech adaptation is set it supersedes the
        # `speech_contexts` field. For more information, see the [speech
        # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength)
        # documentation.
      &quot;phraseSets&quot;: [ # A collection of phrase sets. To specify the hints inline, leave the
          # phrase set&#x27;s `name` blank and fill in the rest of its fields. Any
          # phrase set can use any custom class.
        { # Provides &quot;hints&quot; to the speech recognizer to favor specific words and phrases
            # in the results.
          &quot;boost&quot;: 3.14, # Hint Boost. A positive value will increase the probability that a specific
              # phrase will be recognized over other similar-sounding phrases. The higher
              # the boost, the higher the chance of false positive recognition as well.
              # Negative boost values would correspond to anti-biasing. Anti-biasing is not
              # enabled, so negative boost will simply be ignored. Though `boost` can
              # accept a wide range of positive values, most use cases are best served with
              # values between 0 (exclusive) and 20. We recommend using a binary search
              # approach to finding the optimal value for your use case. Speech recognition
              # will skip PhraseSets with a boost value of 0.
          &quot;name&quot;: &quot;A String&quot;, # The resource name of the phrase set.
          &quot;phrases&quot;: [ # A list of words and phrases.
            { # A phrase containing words and phrase &quot;hints&quot; so that
                # the speech recognition is more likely to recognize them. This can be used
                # to improve the accuracy for specific words and phrases, for example, if
                # specific commands are typically spoken by the user. This can also be used
                # to add additional words to the vocabulary of the recognizer. See
                # [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
                #
                # List items can also include pre-built or custom classes containing groups
                # of words that represent common concepts that occur in natural language. For
                # example, rather than providing a phrase hint for every month of the
                # year (e.g. &quot;i was born in january&quot;, &quot;i was born in february&quot;, ...), using the
                # pre-built `$MONTH` class improves the likelihood of correctly transcribing
                # audio that includes months (e.g. &quot;i was born in $month&quot;).
                # To refer to pre-built classes, use the class&#x27;s symbol prepended with `$`,
                # e.g. `$MONTH`. To refer to custom classes that were defined inline in the
                # request, set the class&#x27;s `custom_class_id` to a string unique to all class
                # resources and inline classes. Then use the class&#x27;s id wrapped in `${...}`,
                # e.g. &quot;${my-months}&quot;. To refer to custom class resources, use the class&#x27;s
                # id wrapped in `${}` (e.g. `${my-months}`).
              &quot;boost&quot;: 3.14, # Hint Boost. Overrides the boost set at the phrase set level.
                  # A positive value will increase the probability that a specific phrase will
                  # be recognized over other similar-sounding phrases. The higher the boost,
                  # the higher the chance of false positive recognition as well. Negative
                  # boost values would correspond to anti-biasing. Anti-biasing is not
                  # enabled, so negative boost will simply be ignored. Though `boost` can
                  # accept a wide range of positive values, most use cases are best served
                  # with values between 0 and 20. We recommend using a binary search approach
                  # to finding the optimal value for your use case. Speech recognition
                  # will skip PhraseSets with a boost value of 0.
              &quot;value&quot;: &quot;A String&quot;, # The phrase itself.
            },
          ],
        },
      ],
      &quot;customClasses&quot;: [ # A collection of custom classes. To specify the classes inline, leave the
          # class&#x27;s `name` blank and fill in the rest of its fields, giving it a unique
          # `custom_class_id`. Refer to the inline defined class in phrase hints by its
          # `custom_class_id`.
        { # A set of words or phrases that represents a common concept likely to appear
            # in your audio, for example a list of passenger ship names. CustomClass items
            # can be substituted into placeholders that you set in PhraseSet phrases.
          &quot;items&quot;: [ # A collection of class items.
            { # An item of the class.
              &quot;value&quot;: &quot;A String&quot;, # The class item&#x27;s value.
            },
          ],
          &quot;customClassId&quot;: &quot;A String&quot;, # If this custom class is a resource, the custom_class_id is the resource id
              # of the CustomClass. Case sensitive.
          &quot;name&quot;: &quot;A String&quot;, # The resource name of the custom class.
        },
      ],
    },
    &quot;model&quot;: &quot;A String&quot;, # Which model to select for the given request. Select the model
        # best suited to your domain to get best results. If a model is not
        # explicitly specified, then we auto-select a model based on the parameters
        # in the RecognitionConfig.
        # &lt;table&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;b&gt;Model&lt;/b&gt;&lt;/td&gt;
        # &lt;td&gt;&lt;b&gt;Description&lt;/b&gt;&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;command_and_search&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for short queries such as voice commands or voice search.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;phone_call&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that originated from a phone call (typically
        # recorded at an 8 kHz sampling rate).&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;video&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that originated from video or includes multiple
        # speakers. Ideally the audio is recorded at a 16 kHz or greater
        # sampling rate. This is a premium model that costs more than the
        # standard rate.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;tr&gt;
        # &lt;td&gt;&lt;code&gt;default&lt;/code&gt;&lt;/td&gt;
        # &lt;td&gt;Best for audio that is not one of the specific audio models.
        # For example, long-form audio. Ideally the audio is high-fidelity,
        # recorded at a 16 kHz or greater sampling rate.&lt;/td&gt;
        # &lt;/tr&gt;
        # &lt;/table&gt;
    &quot;useEnhanced&quot;: True or False, # Set to true to use an enhanced model for speech recognition.
        # If `use_enhanced` is set to true and the `model` field is not set, then
        # an appropriate enhanced model is chosen if an enhanced model exists for
        # the audio.
        #
        # If `use_enhanced` is true and an enhanced version of the specified model
        # does not exist, then the speech is recognized using the standard version
        # of the specified model.
    &quot;alternativeLanguageCodes&quot;: [ # A list of up to 3 additional
        # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
        # listing possible alternative languages of the supplied audio.
        # See [Language
        # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
        # of the currently supported language codes. If alternative languages are
        # listed, the recognition result will contain recognition in the most likely
        # language detected, including the main language_code. The recognition result
        # will include the language tag of the language detected in the audio. Note:
        # This feature is only supported for Voice Command and Voice Search use cases,
        # and performance may vary for other use cases (e.g., phone call
        # transcription).
      &quot;A String&quot;,
    ],
    &quot;enableSeparateRecognitionPerChannel&quot;: True or False, # This needs to be set to `true` explicitly and `audio_channel_count` &gt; 1
        # to get each channel recognized separately. The recognition result will
        # contain a `channel_tag` field to state which channel that result belongs
        # to. If this is not true, we will only recognize the first channel. The
        # request is billed cumulatively for all channels recognized:
        # `audio_channel_count` multiplied by the length of the audio.
  },
}

  x__xgafv: string, V1 error format.
    Allowed values
      1 - v1 error format
      2 - v2 error format

Returns:
  An object of the form:

    { # The only message returned to the client by the `Recognize` method. It
        # contains the result as zero or more sequential `SpeechRecognitionResult`
        # messages.
      &quot;results&quot;: [ # Sequential list of transcription results corresponding to
          # sequential portions of audio.
        { # A speech recognition result corresponding to a portion of the audio.
          &quot;alternatives&quot;: [ # May contain one or more recognition hypotheses (up to the
              # maximum specified in `max_alternatives`).
              # These alternatives are ordered in terms of accuracy, with the top (first)
              # alternative being the most probable, as ranked by the recognizer.
            { # Alternative hypotheses (a.k.a. n-best list).
              &quot;transcript&quot;: &quot;A String&quot;, # Transcript text representing the words that the user spoke.
              &quot;words&quot;: [ # A list of word-specific information for each recognized word.
                  # Note: When `enable_speaker_diarization` is true, you will see all the words
                  # from the beginning of the audio.
                { # Word-specific information for recognized words.
                  &quot;startTime&quot;: &quot;A String&quot;, # Time offset relative to the beginning of the audio,
                      # and corresponding to the start of the spoken word.
                      # This field is only set if `enable_word_time_offsets=true` and only
                      # in the top hypothesis.
                      # This is an experimental feature and the accuracy of the time offset can
                      # vary.
                  &quot;speakerTag&quot;: 42, # Output only. A distinct integer value is assigned for every speaker within
                      # the audio. This field specifies which one of those speakers was detected to
                      # have spoken this word. Values range from `1` to diarization_speaker_count.
                      # speaker_tag is set only if enable_speaker_diarization = `true`, and only in
                      # the top alternative.
                  &quot;word&quot;: &quot;A String&quot;, # The word corresponding to this set of information.
                  &quot;endTime&quot;: &quot;A String&quot;, # Time offset relative to the beginning of the audio,
                      # and corresponding to the end of the spoken word.
                      # This field is only set if `enable_word_time_offsets=true` and only
                      # in the top hypothesis.
                      # This is an experimental feature and the accuracy of the time offset can
                      # vary.
                  &quot;confidence&quot;: 3.14, # The confidence estimate between 0.0 and 1.0. A higher number
                      # indicates an estimated greater likelihood that the recognized words are
                      # correct. This field is set only for the top alternative of a non-streaming
                      # result or of a streaming result where `is_final=true`.
                      # This field is not guaranteed to be accurate and users should not rely on it
                      # to be always provided.
                      # The default of 0.0 is a sentinel value indicating `confidence` was not set.
                },
              ],
              &quot;confidence&quot;: 3.14, # The confidence estimate between 0.0 and 1.0. A higher number
                  # indicates an estimated greater likelihood that the recognized words are
                  # correct. This field is set only for the top alternative of a non-streaming
                  # result or of a streaming result where `is_final=true`.
                  # This field is not guaranteed to be accurate and users should not rely on it
                  # to be always provided.
                  # The default of 0.0 is a sentinel value indicating `confidence` was not set.
            },
          ],
          &quot;channelTag&quot;: 42, # For multi-channel audio, this is the channel number corresponding to the
              # recognized result for the audio from that channel.
              # For audio_channel_count = N, its output values can range from `1` to `N`.
          &quot;languageCode&quot;: &quot;A String&quot;, # Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
              # of the language in this result. This language code was detected as the most
              # likely language being spoken in the audio.
        },
      ],
    }</pre>
</div>
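<p>Again as an illustrative sketch only: a minimal synchronous call with the google-api-python-client library. The local file path, language, and phrase hints below are placeholder assumptions; <code>content</code> must be base64-encoded because JSON requests encode bytes fields as base64, as noted above.</p>
<pre>
# Sketch: synchronous recognition of a short local file, assuming
# application default credentials are configured. audio.wav is a
# placeholder path.
import base64

from googleapiclient import discovery

service = discovery.build('speech', 'v1p1beta1')

with open('audio.wav', 'rb') as f:  # placeholder path
    audio_bytes = f.read()

body = {
    'config': {
        'languageCode': 'en-US',  # required
        'maxAlternatives': 2,
        # Optional context hints, per the speechContexts field above.
        'speechContexts': [{'phrases': ['Polycom SoundStation']}],
    },
    'audio': {
        'content': base64.b64encode(audio_bytes).decode('utf-8'),
    },
}

# The response contains zero or more sequential SpeechRecognitionResults.
response = service.speech().recognize(body=body).execute()
for result in response.get('results', []):
    for alternative in result['alternatives']:
        print(alternative.get('confidence'), alternative['transcript'])
</pre>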

</body></html>