Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 1 | <html><body> |
| 2 | <style> |
| 3 | |
| 4 | body, h1, h2, h3, div, span, p, pre, a { |
| 5 | margin: 0; |
| 6 | padding: 0; |
| 7 | border: 0; |
| 8 | font-weight: inherit; |
| 9 | font-style: inherit; |
| 10 | font-size: 100%; |
| 11 | font-family: inherit; |
| 12 | vertical-align: baseline; |
| 13 | } |
| 14 | |
| 15 | body { |
| 16 | font-size: 13px; |
| 17 | padding: 1em; |
| 18 | } |
| 19 | |
| 20 | h1 { |
| 21 | font-size: 26px; |
| 22 | margin-bottom: 1em; |
| 23 | } |
| 24 | |
| 25 | h2 { |
| 26 | font-size: 24px; |
| 27 | margin-bottom: 1em; |
| 28 | } |
| 29 | |
| 30 | h3 { |
| 31 | font-size: 20px; |
| 32 | margin-bottom: 1em; |
| 33 | margin-top: 1em; |
| 34 | } |
| 35 | |
| 36 | pre, code { |
| 37 | line-height: 1.5; |
| 38 | font-family: Monaco, 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', 'Lucida Console', monospace; |
| 39 | } |
| 40 | |
| 41 | pre { |
| 42 | margin-top: 0.5em; |
| 43 | } |
| 44 | |
| 45 | h1, h2, h3, p { |
| 46 | font-family: Arial, sans-serif; |
| 47 | } |
| 48 | |
| 49 | h1, h2, h3 { |
| 50 | border-bottom: solid #CCC 1px; |
| 51 | } |
| 52 | |
| 53 | .toc_element { |
| 54 | margin-top: 0.5em; |
| 55 | } |
| 56 | |
| 57 | .firstline { |
| 58 | margin-left: 2em; |
| 59 | } |
| 60 | |
| 61 | .method { |
| 62 | margin-top: 1em; |
| 63 | border: solid 1px #CCC; |
| 64 | padding: 1em; |
| 65 | background: #EEE; |
| 66 | } |
| 67 | |
| 68 | .details { |
| 69 | font-weight: bold; |
| 70 | font-size: 14px; |
| 71 | } |
| 72 | |
| 73 | </style> |
| 74 | |
| 75 | <h1><a href="speech_v1p1beta1.html">Cloud Speech-to-Text API</a> . <a href="speech_v1p1beta1.speech.html">speech</a></h1> |
| 76 | <h2>Instance Methods</h2> |
| 77 | <p class="toc_element"> |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 78 | <code><a href="#longrunningrecognize">longrunningrecognize(body=None, x__xgafv=None)</a></code></p> |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 79 | <p class="firstline">Performs asynchronous speech recognition: receive results via the</p> |
| 80 | <p class="toc_element"> |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 81 | <code><a href="#recognize">recognize(body=None, x__xgafv=None)</a></code></p> |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 82 | <p class="firstline">Performs synchronous speech recognition: receive results after all audio</p> |
| 83 | <h3>Method Details</h3> |
| 84 | <div class="method"> |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 85 | <code class="details" id="longrunningrecognize">longrunningrecognize(body=None, x__xgafv=None)</code> |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 86 | <pre>Performs asynchronous speech recognition: receive results via the |
| 87 | google.longrunning.Operations interface. Returns either an |
| 88 | `Operation.error` or an `Operation.response` which contains |
| 89 | a `LongRunningRecognizeResponse` message. |
| 90 | For more information on asynchronous speech recognition, see the |
| 91 | [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize). |
| 92 | |
| 93 | Args: |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 94 | body: object, The request body. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 95 | The object takes the form of: |
| 96 | |
| 97 | { # The top-level message sent by the client for the `LongRunningRecognize` |
| 98 | # method. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 99 | "config": { # Provides information to the recognizer that specifies how to process the # Required. Provides information to the recognizer that specifies how to |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 100 | # process the request. |
| 101 | # request. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 102 | "metadata": { # Description of audio data to be recognized. # Metadata regarding this request. |
| 103 | "originalMediaType": "A String", # The original media the speech was recorded on. |
| 104 | "obfuscatedId": "A String", # Obfuscated (privacy-protected) ID of the user, to identify number of |
| 105 | # unique users using the service. |
| 106 | "recordingDeviceType": "A String", # The type of device the speech was recorded with. |
| 107 | "interactionType": "A String", # The use case most closely describing the audio content to be recognized. |
| 108 | "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or |
| 109 | # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or |
| 110 | # 'Cardioid Microphone'. |
| 111 | "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`, |
| 112 | # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`. |
| 113 | # A list of possible audio mime types is maintained at |
| 114 | # http://www.iana.org/assignments/media-types/media-types.xhtml#audio |
| 115 | "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court |
| 116 | # hearings from 2012". |
| 117 | "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most |
| 118 | # closely applies. This is most indicative of the topics contained |
| 119 | # in the audio. Use the 6-digit NAICS code to identify the industry |
| 120 | # vertical - see https://www.naics.com/search/. |
| 121 | "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized. |
| 122 | }, |
| 123 | "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all |
| 124 | # `RecognitionAudio` messages. Valid values are: 8000-48000. |
| 125 | # 16000 is optimal. For best results, set the sampling rate of the audio |
| 126 | # source to 16000 Hz. If that's not possible, use the native sample rate of |
| 127 | # the audio source (instead of re-sampling). |
| 128 | # This field is optional for FLAC and WAV audio files, but is |
| 129 | # required for all other audio formats. For details, see AudioEncoding. |
| 130 | "enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1 |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 131 | # to get each channel recognized separately. The recognition result will |
| 132 | # contain a `channel_tag` field to state which channel that result belongs |
| 133 | # to. If this is not true, we will only recognize the first channel. The |
| 134 | # request is billed cumulatively for all channels recognized: |
| 135 | # `audio_channel_count` multiplied by the length of the audio. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 136 | "enableAutomaticPunctuation": True or False, # If 'true', adds punctuation to recognition result hypotheses. |
| 137 | # This feature is only available in select languages. Setting this for |
| 138 | # requests in other languages has no effect at all. |
| 139 | # The default 'false' value does not add punctuation to result hypotheses. |
| 140 | "adaptation": { # Speech adaptation configuration. # Speech adaptation configuration improves the accuracy of speech |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 141 | # recognition. When speech adaptation is set it supersedes the |
| 142 | # `speech_contexts` field. For more information, see the [speech |
| 143 | # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength) |
| 144 | # documentation. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 145 | "phraseSets": [ # A collection of phrase sets. To specify the hints inline, leave the |
| 146 | # phrase set's `name` blank and fill in the rest of its fields. Any |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 147 | # phrase set can use any custom class. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 148 | { # Provides "hints" to the speech recognizer to favor specific words and phrases |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 149 | # in the results. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 150 | "boost": 3.14, # Hint Boost. Positive value will increase the probability that a specific |
| 151 | # phrase will be recognized over other similar sounding phrases. The higher |
| 152 | # the boost, the higher the chance of false positive recognition as well. |
| 153 | # Negative boost values would correspond to anti-biasing. Anti-biasing is not |
| 154 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 155 | # accept a wide range of positive values, most use cases are best served with |
| 156 | # values between 0 (exclusive) and 20. We recommend using a binary search |
| 157 | # approach to finding the optimal value for your use case. Speech recognition |
| 158 | # will skip PhraseSets with a boost value of 0. |
| 159 | "name": "A String", # The resource name of the phrase set. |
| 160 | "phrases": [ # A list of word and phrases. |
| 161 | { # A phrase containing words and phrase "hints" so that |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 162 | # the speech recognition is more likely to recognize them. This can be used |
| 163 | # to improve the accuracy for specific words and phrases, for example, if |
| 164 | # specific commands are typically spoken by the user. This can also be used |
| 165 | # to add additional words to the vocabulary of the recognizer. See |
| 166 | # [usage limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 167 | # |
| 168 | # List items can also include pre-built or custom classes containing groups |
| 169 | # of words that represent common concepts that occur in natural language. For |
| 170 | # example, rather than providing a phrase hint for every month of the |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 171 | # year (e.g. "i was born in january", "i was born in february", ...), use the |
| 172 | # pre-built `$MONTH` class to improve the likelihood of correctly transcribing |
| 173 | # audio that includes months (e.g. "i was born in $month"). |
| 174 | # To refer to pre-built classes, use the class' symbol prepended with `$` |
| 175 | # e.g. `$MONTH`. To refer to custom classes that were defined inline in the |
| 176 | # request, set the class's `custom_class_id` to a string unique to all class |
| 177 | # resources and inline classes. Then use the class' id wrapped in $`{...}` |
| 178 | # e.g. "${my-months}". To refer to custom classes resources, use the class' |
| 179 | # id wrapped in `${}` (e.g. `${my-months}`). |
| 180 | "value": "A String", # The phrase itself. |
| 181 | "boost": 3.14, # Hint Boost. Overrides the boost set at the phrase set level. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 182 | # Positive value will increase the probability that a specific phrase will |
| 183 | # be recognized over other similar sounding phrases. The higher the boost, |
| 184 | # the higher the chance of false positive recognition as well. Negative |
| 185 | # boost values would correspond to anti-biasing. Anti-biasing is not |
| 186 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 187 | # accept a wide range of positive values, most use cases are best served |
| 188 | # with values between 0 and 20. We recommend using a binary search approach |
| 189 | # to finding the optimal value for your use case. Speech recognition |
| 190 | # will skip PhraseSets with a boost value of 0. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 191 | }, |
| 192 | ], |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 193 | }, |
| 194 | ], |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 195 | "customClasses": [ # A collection of custom classes. To specify the classes inline, leave the |
| 196 | # class' `name` blank and fill in the rest of its fields, giving it a unique |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 197 | # `custom_class_id`. Refer to the inline defined class in phrase hints by its |
| 198 | # `custom_class_id`. |
| 199 | { # A set of words or phrases that represents a common concept likely to appear |
| 200 | # in your audio, for example a list of passenger ship names. CustomClass items |
| 201 | # can be substituted into placeholders that you set in PhraseSet phrases. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 202 | "name": "A String", # The resource name of the custom class. |
| 203 | "customClassId": "A String", # If this custom class is a resource, the custom_class_id is the resource id |
| 204 | # of the CustomClass. Case sensitive. |
| 205 | "items": [ # A collection of class items. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 206 | { # An item of the class. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 207 | "value": "A String", # The class item's value. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 208 | }, |
| 209 | ], |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 210 | }, |
| 211 | ], |
| 212 | }, |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 213 | "maxAlternatives": 42, # Maximum number of recognition hypotheses to be returned. |
| 214 | # Specifically, the maximum number of `SpeechRecognitionAlternative` messages |
| 215 | # within each `SpeechRecognitionResult`. |
| 216 | # The server may return fewer than `max_alternatives`. |
| 217 | # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of |
| 218 | # one. If omitted, will return a maximum of one. |
| 219 | "diarizationSpeakerCount": 42, # If set, specifies the estimated number of speakers in the conversation. |
| 220 | # Defaults to '2'. Ignored unless enable_speaker_diarization is set to true. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 221 | # Note: Use diarization_config instead. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 222 | "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages. |
| 223 | # This field is optional for `FLAC` and `WAV` audio files and required |
| 224 | # for all other audio formats. For details, see AudioEncoding. |
| 225 | "speechContexts": [ # Array of SpeechContext. |
| 226 | # A means to provide context to assist the speech recognition. For more |
| 227 | # information, see |
| 228 | # [speech |
| 229 | # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength). |
| 230 | { # Provides "hints" to the speech recognizer to favor specific words and phrases |
| 231 | # in the results. |
| 232 | "phrases": [ # A list of strings containing words and phrases "hints" so that |
| 233 | # the speech recognition is more likely to recognize them. This can be used |
| 234 | # to improve the accuracy for specific words and phrases, for example, if |
| 235 | # specific commands are typically spoken by the user. This can also be used |
| 236 | # to add additional words to the vocabulary of the recognizer. See |
| 237 | # [usage limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 238 | # |
| 239 | # List items can also be set to classes for groups of words that represent |
| 240 | # common concepts that occur in natural language. For example, rather than |
| 241 | # providing phrase hints for every month of the year, using the $MONTH class |
| 242 | # improves the likelihood of correctly transcribing audio that includes |
| 243 | # months. |
| 244 | "A String", |
| 245 | ], |
| 246 | "boost": 3.14, # Hint Boost. Positive value will increase the probability that a specific |
| 247 | # phrase will be recognized over other similar sounding phrases. The higher |
| 248 | # the boost, the higher the chance of false positive recognition as well. |
| 249 | # Negative boost values would correspond to anti-biasing. Anti-biasing is not |
| 250 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 251 | # accept a wide range of positive values, most use cases are best served with |
| 252 | # values between 0 and 20. We recommend using a binary search approach to |
| 253 | # finding the optimal value for your use case. |
| 254 | }, |
| 255 | ], |
| 256 | "enableWordConfidence": True or False, # If `true`, the top result includes a list of words and the |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 257 | # confidence for those words. If `false`, no word-level confidence |
| 258 | # information is returned. The default is `false`. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 259 | "model": "A String", # Which model to select for the given request. Select the model |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 260 | # best suited to your domain to get best results. If a model is not |
| 261 | # explicitly specified, then we auto-select a model based on the parameters |
| 262 | # in the RecognitionConfig. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 263 | # <table> |
| 264 | # <tr> |
| 265 | # <td><b>Model</b></td> |
| 266 | # <td><b>Description</b></td> |
| 267 | # </tr> |
| 268 | # <tr> |
| 269 | # <td><code>command_and_search</code></td> |
| 270 | # <td>Best for short queries such as voice commands or voice search.</td> |
| 271 | # </tr> |
| 272 | # <tr> |
| 273 | # <td><code>phone_call</code></td> |
| 274 | # <td>Best for audio that originated from a phone call (typically |
| 275 | # recorded at an 8khz sampling rate).</td> |
| 276 | # </tr> |
| 277 | # <tr> |
| 278 | # <td><code>video</code></td> |
| 279 | # <td>Best for audio that originated from video or includes multiple |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 280 | # speakers. Ideally the audio is recorded at a 16khz or greater |
| 281 | # sampling rate. This is a premium model that costs more than the |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 282 | # standard rate.</td> |
| 283 | # </tr> |
| 284 | # <tr> |
| 285 | # <td><code>default</code></td> |
| 286 | # <td>Best for audio that is not one of the specific audio models. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 287 | # For example, long-form audio. Ideally the audio is high-fidelity, |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 288 | # recorded at a 16khz or greater sampling rate.</td> |
| 289 | # </tr> |
| 290 | # </table> |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 291 | "audioChannelCount": 42, # The number of channels in the input audio data. |
| 292 | # ONLY set this for MULTI-CHANNEL recognition. |
| 293 | # Valid values for LINEAR16 and FLAC are `1`-`8`. |
| 294 | # Valid values for OGG_OPUS are '1'-'254'. |
| 295 | # Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. |
| 296 | # If `0` or omitted, defaults to one channel (mono). |
| 297 | # Note: We only recognize the first channel by default. |
| 298 | # To perform independent recognition on each channel set |
| 299 | # `enable_separate_recognition_per_channel` to 'true'. |
| 300 | "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and |
| 301 | # the start and end time offsets (timestamps) for those words. If |
| 302 | # `false`, no word-level time offset information is returned. The default is |
| 303 | # `false`. |
| 304 | "alternativeLanguageCodes": [ # A list of up to 3 additional |
| 305 | # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, |
| 306 | # listing possible alternative languages of the supplied audio. |
| 307 | # See [Language |
| 308 | # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list |
| 309 | # of the currently supported language codes. If alternative languages are |
| 310 | # listed, recognition result will contain recognition in the most likely |
| 311 | # language detected including the main language_code. The recognition result |
| 312 | # will include the language tag of the language detected in the audio. Note: |
| 313 | # This feature is only supported for Voice Command and Voice Search use cases |
| 314 | # and performance may vary for other use cases (e.g., phone call |
| 315 | # transcription). |
| 316 | "A String", |
| 317 | ], |
| 318 | "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 319 | # parameters to make diarization better suited for your application. |
| 320 | # Note: When this is enabled, we send all the words from the beginning of the |
| 321 | # audio for the top alternative in every consecutive STREAMING responses. |
| 322 | # This is done in order to improve our speaker tags as our models learn to |
| 323 | # identify the speakers in the conversation over time. |
| 324 | # For non-streaming requests, the diarization results will be provided only |
| 325 | # in the top alternative of the FINAL SpeechRecognitionResult. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 326 | "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 327 | # flexibility by allowing the system to automatically determine the correct |
| 328 | # number of speakers. If not set, the default value is 2. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 329 | "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 330 | # flexibility by allowing the system to automatically determine the correct |
| 331 | # number of speakers. If not set, the default value is 6. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 332 | "speakerTag": 42, # Output only. Unused. |
| 333 | "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in |
| 334 | # the top alternative of the recognition result using a speaker_tag provided |
| 335 | # in the WordInfo. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 336 | }, |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 337 | "languageCode": "A String", # Required. The language of the supplied audio as a |
| 338 | # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. |
| 339 | # Example: "en-US". |
| 340 | # See [Language |
| 341 | # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list |
| 342 | # of the currently supported language codes. |
| 343 | "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out |
| 344 | # profanities, replacing all but the initial character in each filtered word |
| 345 | # with asterisks, e.g. "f***". If set to `false` or omitted, profanities |
| 346 | # won't be filtered out. |
| 347 | "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in |
| 348 | # the top alternative of the recognition result using a speaker_tag provided |
| 349 | # in the WordInfo. |
| 350 | # Note: Use diarization_config instead. |
| 351 | "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition. |
| 352 | # If `use_enhanced` is set to true and the `model` field is not set, then |
| 353 | # an appropriate enhanced model is chosen if an enhanced model exists for |
| 354 | # the audio. |
| 355 | # |
| 356 | # If `use_enhanced` is true and an enhanced version of the specified model |
| 357 | # does not exist, then the speech is recognized using the standard version |
| 358 | # of the specified model. |
| 359 | }, |
| 360 | "audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized. |
| 361 | # Either `content` or `uri` must be supplied. Supplying both or neither |
| 362 | # returns google.rpc.Code.INVALID_ARGUMENT. See |
| 363 | # [content limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 364 | "content": "A String", # The audio data bytes encoded as specified in |
| 365 | # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a |
| 366 | # pure binary representation, whereas JSON representations use base64. |
| 367 | "uri": "A String", # URI that points to a file that contains audio data bytes as specified in |
| 368 | # `RecognitionConfig`. The file must not be compressed (for example, gzip). |
| 369 | # Currently, only Google Cloud Storage URIs are |
| 370 | # supported, which must be specified in the following format: |
| 371 | # `gs://bucket_name/object_name` (other URI formats return |
| 372 | # google.rpc.Code.INVALID_ARGUMENT). For more information, see |
| 373 | # [Request URIs](https://cloud.google.com/storage/docs/reference-uris). |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 374 | }, |
| 375 | } |
| 376 | |
| 377 | x__xgafv: string, V1 error format. |
| 378 | Allowed values |
| 379 | 1 - v1 error format |
| 380 | 2 - v2 error format |
| 381 | |
| 382 | Returns: |
| 383 | An object of the form: |
| 384 | |
| 385 | { # This resource represents a long-running operation that is the result of a |
| 386 | # network API call. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 387 | "name": "A String", # The server-assigned name, which is only unique within the same service that |
| 388 | # originally returns it. If you use the default HTTP mapping, the |
| 389 | # `name` should be a resource name ending with `operations/{unique_id}`. |
| 390 | "error": { # The `Status` type defines a logical error model that is suitable for # The error result of the operation in case of failure or cancellation. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 391 | # different programming environments, including REST APIs and RPC APIs. It is |
| 392 | # used by [gRPC](https://github.com/grpc). Each `Status` message contains |
| 393 | # three pieces of data: error code, error message, and error details. |
| 394 | # |
| 395 | # You can find out more about this error model and how to work with it in the |
| 396 | # [API Design Guide](https://cloud.google.com/apis/design/errors). |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 397 | "code": 42, # The status code, which should be an enum value of google.rpc.Code. |
| 398 | "message": "A String", # A developer-facing error message, which should be in English. Any |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 399 | # user-facing error message should be localized and sent in the |
| 400 | # google.rpc.Status.details field, or localized by the client. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 401 | "details": [ # A list of messages that carry the error details. There is a common set of |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 402 | # message types for APIs to use. |
| 403 | { |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 404 | "a_key": "", # Properties of the object. Contains field @type with type URL. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 405 | }, |
| 406 | ], |
| 407 | }, |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 408 | "metadata": { # Service-specific metadata associated with the operation. It typically |
| 409 | # contains progress information and common metadata such as create time. |
| 410 | # Some services might not provide such metadata. Any method that returns a |
| 411 | # long-running operation should document the metadata type, if any. |
| 412 | "a_key": "", # Properties of the object. Contains field @type with type URL. |
| 413 | }, |
| 414 | "done": True or False, # If the value is `false`, it means the operation is still in progress. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 415 | # If `true`, the operation is completed, and either `error` or `response` is |
| 416 | # available. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 417 | "response": { # The normal response of the operation in case of success. If the original |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 418 | # method returns no data on success, such as `Delete`, the response is |
| 419 | # `google.protobuf.Empty`. If the original method is standard |
| 420 | # `Get`/`Create`/`Update`, the response should be the resource. For other |
| 421 | # methods, the response should have the type `XxxResponse`, where `Xxx` |
| 422 | # is the original method name. For example, if the original method name |
| 423 | # is `TakeSnapshot()`, the inferred response type is |
| 424 | # `TakeSnapshotResponse`. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 425 | "a_key": "", # Properties of the object. Contains field @type with type URL. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 426 | }, |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 427 | }</pre> |
| 428 | </div> |
| 429 | |
| 430 | <div class="method"> |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 431 | <code class="details" id="recognize">recognize(body=None, x__xgafv=None)</code> |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 432 | <pre>Performs synchronous speech recognition: receive results after all audio |
| 433 | has been sent and processed. |
| 434 | |
| 435 | Args: |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 436 | body: object, The request body. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 437 | The object takes the form of: |
| 438 | |
| 439 | { # The top-level message sent by the client for the `Recognize` method. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 440 | "config": { # Provides information to the recognizer that specifies how to process the # Required. Provides information to the recognizer that specifies how to |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 441 | # process the request. |
| 442 | # request. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 443 | "metadata": { # Description of audio data to be recognized. # Metadata regarding this request. |
| 444 | "originalMediaType": "A String", # The original media the speech was recorded on. |
| 445 | "obfuscatedId": "A String", # Obfuscated (privacy-protected) ID of the user, to identify number of |
| 446 | # unique users using the service. |
| 447 | "recordingDeviceType": "A String", # The type of device the speech was recorded with. |
| 448 | "interactionType": "A String", # The use case most closely describing the audio content to be recognized. |
| 449 | "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or |
| 450 | # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or |
| 451 | # 'Cardioid Microphone'. |
| 452 | "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`, |
| 453 | # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`. |
| 454 | # A list of possible audio mime types is maintained at |
| 455 | # http://www.iana.org/assignments/media-types/media-types.xhtml#audio |
| 456 | "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court |
| 457 | # hearings from 2012". |
| 458 | "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most |
| 459 | # closely applies. This is most indicative of the topics contained |
| 460 | # in the audio. Use the 6-digit NAICS code to identify the industry |
| 461 | # vertical - see https://www.naics.com/search/. |
| 462 | "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized. |
| 463 | }, |
| 464 | "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all |
| 465 | # `RecognitionAudio` messages. Valid values are: 8000-48000. |
| 466 | # 16000 is optimal. For best results, set the sampling rate of the audio |
| 467 | # source to 16000 Hz. If that's not possible, use the native sample rate of |
| 468 | # the audio source (instead of re-sampling). |
| 469 | # This field is optional for FLAC and WAV audio files, but is |
| 470 | # required for all other audio formats. For details, see AudioEncoding. |
| 471 | "enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1 |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 472 | # to get each channel recognized separately. The recognition result will |
| 473 | # contain a `channel_tag` field to state which channel that result belongs |
| 474 | # to. If this is not true, we will only recognize the first channel. The |
| 475 | # request is billed cumulatively for all channels recognized: |
| 476 | # `audio_channel_count` multiplied by the length of the audio. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 477 | "enableAutomaticPunctuation": True or False, # If 'true', adds punctuation to recognition result hypotheses. |
| 478 | # This feature is only available in select languages. Setting this for |
| 479 | # requests in other languages has no effect at all. |
| 480 | # The default 'false' value does not add punctuation to result hypotheses. |
| 481 | "adaptation": { # Speech adaptation configuration. # Speech adaptation configuration improves the accuracy of speech |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 482 | # recognition. When speech adaptation is set it supersedes the |
| 483 | # `speech_contexts` field. For more information, see the [speech |
| 484 | # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength) |
| 485 | # documentation. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 486 | "phraseSets": [ # A collection of phrase sets. To specify the hints inline, leave the |
| 487 | # phrase set's `name` blank and fill in the rest of its fields. Any |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 488 | # phrase set can use any custom class. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 489 | { # Provides "hints" to the speech recognizer to favor specific words and phrases |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 490 | # in the results. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 491 | "boost": 3.14, # Hint Boost. Positive value will increase the probability that a specific |
| 492 | # phrase will be recognized over other similar sounding phrases. The higher |
| 493 | # the boost, the higher the chance of false positive recognition as well. |
| 494 | # Negative boost values would correspond to anti-biasing. Anti-biasing is not |
| 495 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 496 | # accept a wide range of positive values, most use cases are best served with |
| 497 | # values between 0 (exclusive) and 20. We recommend using a binary search |
| 498 | # approach to finding the optimal value for your use case. Speech recognition |
| 499 | # will skip PhraseSets with a boost value of 0. |
| 500 | "name": "A String", # The resource name of the phrase set. |
| 501 | "phrases": [ # A list of word and phrases. |
| 502 |         { # A phrase containing words and phrase "hints" so that
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 503 | # the speech recognition is more likely to recognize them. This can be used |
| 504 | # to improve the accuracy for specific words and phrases, for example, if |
| 505 | # specific commands are typically spoken by the user. This can also be used |
| 506 | # to add additional words to the vocabulary of the recognizer. See |
| 507 | # [usage limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 508 | # |
| 509 | # List items can also include pre-built or custom classes containing groups |
| 510 | # of words that represent common concepts that occur in natural language. For |
| 511 | # example, rather than providing a phrase hint for every month of the |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 512 | # year (e.g. "i was born in january", "i was born in february", ...), use the
| 513 | # pre-built `$MONTH` class improves the likelihood of correctly transcribing |
| 514 | # audio that includes months (e.g. "i was born in $month"). |
| 515 | # To refer to pre-built classes, use the class' symbol prepended with `$` |
| 516 | # e.g. `$MONTH`. To refer to custom classes that were defined inline in the |
| 517 | # request, set the class's `custom_class_id` to a string unique to all class |
| 518 | # resources and inline classes. Then use the class' id wrapped in `${...}`
| 519 | # e.g. "${my-months}". To refer to custom classes resources, use the class' |
| 520 | # id wrapped in `${}` (e.g. `${my-months}`). |
| 521 | "value": "A String", # The phrase itself. |
| 522 | "boost": 3.14, # Hint Boost. Overrides the boost set at the phrase set level. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 523 | # Positive value will increase the probability that a specific phrase will |
| 524 | # be recognized over other similar sounding phrases. The higher the boost, |
| 525 | # the higher the chance of false positive recognition as well. Negative |
| 526 | # boost values would correspond to anti-biasing. Anti-biasing is not |
| 527 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 528 | # accept a wide range of positive values, most use cases are best served |
| 529 | # with values between 0 and 20. We recommend using a binary search approach |
| 530 | # to finding the optimal value for your use case. Speech recognition |
| 531 | # will skip PhraseSets with a boost value of 0. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 532 | }, |
| 533 | ], |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 534 | }, |
| 535 | ], |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 536 | "customClasses": [ # A collection of custom classes. To specify the classes inline, leave the |
| 537 | # class' `name` blank and fill in the rest of its fields, giving it a unique |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 538 | # `custom_class_id`. Refer to the inline defined class in phrase hints by its |
| 539 | # `custom_class_id`. |
| 540 | { # A set of words or phrases that represents a common concept likely to appear |
| 541 | # in your audio, for example a list of passenger ship names. CustomClass items |
| 542 | # can be substituted into placeholders that you set in PhraseSet phrases. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 543 | "name": "A String", # The resource name of the custom class. |
| 544 | "customClassId": "A String", # If this custom class is a resource, the custom_class_id is the resource id |
| 545 | # of the CustomClass. Case sensitive. |
| 546 | "items": [ # A collection of class items. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 547 | { # An item of the class. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 548 | "value": "A String", # The class item's value. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 549 | }, |
| 550 | ], |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 551 | }, |
| 552 | ], |
| 553 | }, |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 554 | "maxAlternatives": 42, # Maximum number of recognition hypotheses to be returned. |
| 555 | # Specifically, the maximum number of `SpeechRecognitionAlternative` messages |
| 556 | # within each `SpeechRecognitionResult`. |
| 557 | # The server may return fewer than `max_alternatives`. |
| 558 | # Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of |
| 559 | # one. If omitted, will return a maximum of one. |
| 560 | "diarizationSpeakerCount": 42, # If set, specifies the estimated number of speakers in the conversation. |
| 561 | # Defaults to '2'. Ignored unless enable_speaker_diarization is set to true. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 562 | # Note: Use diarization_config instead. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 563 | "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages. |
| 564 | # This field is optional for `FLAC` and `WAV` audio files and required |
| 565 | # for all other audio formats. For details, see AudioEncoding. |
| 566 | "speechContexts": [ # Array of SpeechContext. |
| 567 | # A means to provide context to assist the speech recognition. For more |
| 568 | # information, see |
| 569 | # [speech |
| 570 | # adaptation](https://cloud.google.com/speech-to-text/docs/context-strength). |
| 571 | { # Provides "hints" to the speech recognizer to favor specific words and phrases |
| 572 | # in the results. |
| 573 | "phrases": [ # A list of strings containing words and phrases "hints" so that |
| 574 | # the speech recognition is more likely to recognize them. This can be used |
| 575 | # to improve the accuracy for specific words and phrases, for example, if |
| 576 | # specific commands are typically spoken by the user. This can also be used |
| 577 | # to add additional words to the vocabulary of the recognizer. See |
| 578 | # [usage limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 579 | # |
| 580 | # List items can also be set to classes for groups of words that represent |
| 581 | # common concepts that occur in natural language. For example, rather than |
| 582 | # providing phrase hints for every month of the year, using the $MONTH class |
| 583 | # improves the likelihood of correctly transcribing audio that includes |
| 584 | # months. |
| 585 | "A String", |
| 586 | ], |
| 587 | "boost": 3.14, # Hint Boost. Positive value will increase the probability that a specific |
| 588 | # phrase will be recognized over other similar sounding phrases. The higher |
| 589 | # the boost, the higher the chance of false positive recognition as well. |
| 590 | # Negative boost values would correspond to anti-biasing. Anti-biasing is not |
| 591 | # enabled, so negative boost will simply be ignored. Though `boost` can |
| 592 | # accept a wide range of positive values, most use cases are best served with |
| 593 | # values between 0 and 20. We recommend using a binary search approach to |
| 594 | # finding the optimal value for your use case. |
| 595 | }, |
| 596 | ], |
| 597 | "enableWordConfidence": True or False, # If `true`, the top result includes a list of words and the |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 598 | # confidence for those words. If `false`, no word-level confidence |
| 599 | # information is returned. The default is `false`. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 600 | "model": "A String", # Which model to select for the given request. Select the model |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 601 | # best suited to your domain to get best results. If a model is not |
| 602 | # explicitly specified, then we auto-select a model based on the parameters |
| 603 | # in the RecognitionConfig. |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 604 | # <table> |
| 605 | # <tr> |
| 606 | # <td><b>Model</b></td> |
| 607 | # <td><b>Description</b></td> |
| 608 | # </tr> |
| 609 | # <tr> |
| 610 | # <td><code>command_and_search</code></td> |
| 611 | # <td>Best for short queries such as voice commands or voice search.</td> |
| 612 | # </tr> |
| 613 | # <tr> |
| 614 | # <td><code>phone_call</code></td> |
| 615 | # <td>Best for audio that originated from a phone call (typically |
| 616 | # recorded at an 8khz sampling rate).</td> |
| 617 | # </tr> |
| 618 | # <tr> |
| 619 | # <td><code>video</code></td> |
| 620 | # <td>Best for audio that originated from video or includes multiple
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 621 | # speakers. Ideally the audio is recorded at a 16khz or greater |
| 622 | # sampling rate. This is a premium model that costs more than the |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 623 | # standard rate.</td> |
| 624 | # </tr> |
| 625 | # <tr> |
| 626 | # <td><code>default</code></td> |
| 627 | # <td>Best for audio that is not one of the specific audio models. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 628 | # For example, long-form audio. Ideally the audio is high-fidelity, |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 629 | # recorded at a 16khz or greater sampling rate.</td> |
| 630 | # </tr> |
| 631 | # </table> |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 632 | "audioChannelCount": 42, # The number of channels in the input audio data. |
| 633 | # ONLY set this for MULTI-CHANNEL recognition. |
| 634 | # Valid values for LINEAR16 and FLAC are `1`-`8`. |
| 635 | # Valid values for OGG_OPUS are '1'-'254'. |
| 636 | # Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. |
| 637 | # If `0` or omitted, defaults to one channel (mono). |
| 638 | # Note: We only recognize the first channel by default. |
| 639 | # To perform independent recognition on each channel set |
| 640 | # `enable_separate_recognition_per_channel` to 'true'. |
| 641 | "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and |
| 642 | # the start and end time offsets (timestamps) for those words. If |
| 643 | # `false`, no word-level time offset information is returned. The default is |
| 644 | # `false`. |
| 645 | "alternativeLanguageCodes": [ # A list of up to 3 additional |
| 646 | # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, |
| 647 | # listing possible alternative languages of the supplied audio. |
| 648 | # See [Language |
| 649 | # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list |
| 650 | # of the currently supported language codes. If alternative languages are |
| 651 | # listed, recognition result will contain recognition in the most likely |
| 652 | # language detected including the main language_code. The recognition result |
| 653 | # will include the language tag of the language detected in the audio. Note: |
| 654 | # This feature is only supported for Voice Command and Voice Search use cases |
| 655 | # and performance may vary for other use cases (e.g., phone call |
| 656 | # transcription). |
| 657 | "A String", |
| 658 | ], |
| 659 | "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 660 | # parameters to make diarization better suited for your application. |
| 661 | # Note: When this is enabled, we send all the words from the beginning of the |
| 662 | # audio for the top alternative in every consecutive STREAMING response.
| 663 | # This is done in order to improve our speaker tags as our models learn to |
| 664 | # identify the speakers in the conversation over time. |
| 665 | # For non-streaming requests, the diarization results will be provided only |
| 666 | # in the top alternative of the FINAL SpeechRecognitionResult. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 667 | "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 668 | # flexibility by allowing the system to automatically determine the correct |
| 669 | # number of speakers. If not set, the default value is 2. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 670 | "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 671 | # flexibility by allowing the system to automatically determine the correct |
| 672 | # number of speakers. If not set, the default value is 6. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 673 | "speakerTag": 42, # Output only. Unused. |
| 674 | "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in |
| 675 | # the top alternative of the recognition result using a speaker_tag provided |
| 676 | # in the WordInfo. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 677 | }, |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 678 | "languageCode": "A String", # Required. The language of the supplied audio as a |
| 679 | # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. |
| 680 | # Example: "en-US". |
| 681 | # See [Language |
| 682 | # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list |
| 683 | # of the currently supported language codes. |
| 684 | "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out |
| 685 | # profanities, replacing all but the initial character in each filtered word |
| 686 | # with asterisks, e.g. "f***". If set to `false` or omitted, profanities |
| 687 | # won't be filtered out. |
| 688 | "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in |
| 689 | # the top alternative of the recognition result using a speaker_tag provided |
| 690 | # in the WordInfo. |
| 691 | # Note: Use diarization_config instead. |
| 692 | "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition. |
| 693 | # If `use_enhanced` is set to true and the `model` field is not set, then |
| 694 | # an appropriate enhanced model is chosen if an enhanced model exists for |
| 695 | # the audio. |
| 696 | # |
| 697 | # If `use_enhanced` is true and an enhanced version of the specified model |
| 698 | # does not exist, then the speech is recognized using the standard version |
| 699 | # of the specified model. |
| 700 | }, |
| 701 | "audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized. |
| 702 | # Either `content` or `uri` must be supplied. Supplying both or neither |
| 703 | # returns google.rpc.Code.INVALID_ARGUMENT. See |
| 704 | # [content limits](https://cloud.google.com/speech-to-text/quotas#content). |
| 705 | "content": "A String", # The audio data bytes encoded as specified in |
| 706 | # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a |
| 707 | # pure binary representation, whereas JSON representations use base64. |
| 708 | "uri": "A String", # URI that points to a file that contains audio data bytes as specified in |
| 709 | # `RecognitionConfig`. The file must not be compressed (for example, gzip). |
| 710 | # Currently, only Google Cloud Storage URIs are |
| 711 | # supported, which must be specified in the following format: |
| 712 | # `gs://bucket_name/object_name` (other URI formats return |
| 713 | # google.rpc.Code.INVALID_ARGUMENT). For more information, see |
| 714 | # [Request URIs](https://cloud.google.com/storage/docs/reference-uris). |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 715 | }, |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 716 | } |
| 717 | |
| 718 | x__xgafv: string, V1 error format. |
| 719 | Allowed values |
| 720 | 1 - v1 error format |
| 721 | 2 - v2 error format |
| 722 | |
| 723 | Returns: |
| 724 | An object of the form: |
| 725 | |
| 726 | { # The only message returned to the client by the `Recognize` method. It |
| 727 | # contains the result as zero or more sequential `SpeechRecognitionResult` |
| 728 | # messages. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 729 | "results": [ # Sequential list of transcription results corresponding to |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 730 | # sequential portions of audio. |
| 731 | { # A speech recognition result corresponding to a portion of the audio. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 732 | "channelTag": 42, # For multi-channel audio, this is the channel number corresponding to the |
Dan O'Meara | dd49464 | 2020-05-01 07:42:23 -0700 | [diff] [blame] | 733 | # recognized result for the audio from that channel. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 734 | # For audio_channel_count = N, its output values can range from '1' to 'N'. |
| 735 | "languageCode": "A String", # Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag |
| 736 | # of the language in this result. This language code was detected to have |
| 737 | # the most likelihood of being spoken in the audio. |
| 738 | "alternatives": [ # May contain one or more recognition hypotheses (up to the |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 739 | # maximum specified in `max_alternatives`). |
| 740 | # These alternatives are ordered in terms of accuracy, with the top (first) |
| 741 | # alternative being the most probable, as ranked by the recognizer. |
| 742 | { # Alternative hypotheses (a.k.a. n-best list). |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 743 | "confidence": 3.14, # The confidence estimate between 0.0 and 1.0. A higher number |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 744 | # indicates an estimated greater likelihood that the recognized words are |
| 745 | # correct. This field is set only for the top alternative of a non-streaming |
| 746 | # result or, of a streaming result where `is_final=true`. |
| 747 | # This field is not guaranteed to be accurate and users should not rely on it |
| 748 | # to be always provided. |
| 749 | # The default of 0.0 is a sentinel value indicating `confidence` was not set. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 750 | "transcript": "A String", # Transcript text representing the words that the user spoke. |
| 751 | "words": [ # A list of word-specific information for each recognized word. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 752 | # Note: When `enable_speaker_diarization` is true, you will see all the words |
| 753 | # from the beginning of the audio. |
| 754 | { # Word-specific information for recognized words. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 755 | "speakerTag": 42, # Output only. A distinct integer value is assigned for every speaker within |
| 756 | # the audio. This field specifies which one of those speakers was detected to |
| 757 | # have spoken this word. Value ranges from '1' to diarization_speaker_count. |
| 758 | # speaker_tag is set if enable_speaker_diarization = 'true' and only in the |
| 759 | # top alternative. |
| 760 | "endTime": "A String", # Time offset relative to the beginning of the audio, |
| 761 | # and corresponding to the end of the spoken word. |
| 762 | # This field is only set if `enable_word_time_offsets=true` and only |
| 763 | # in the top hypothesis. |
| 764 | # This is an experimental feature and the accuracy of the time offset can |
| 765 | # vary. |
| 766 | "confidence": 3.14, # The confidence estimate between 0.0 and 1.0. A higher number |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 767 | # indicates an estimated greater likelihood that the recognized words are |
| 768 | # correct. This field is set only for the top alternative of a non-streaming |
| 769 | # result or, of a streaming result where `is_final=true`. |
| 770 | # This field is not guaranteed to be accurate and users should not rely on it |
| 771 | # to be always provided. |
| 772 | # The default of 0.0 is a sentinel value indicating `confidence` was not set. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 773 | "startTime": "A String", # Time offset relative to the beginning of the audio, |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 774 | # and corresponding to the start of the spoken word. |
| 775 | # This field is only set if `enable_word_time_offsets=true` and only |
| 776 | # in the top hypothesis. |
| 777 | # This is an experimental feature and the accuracy of the time offset can |
| 778 | # vary. |
Bu Sun Kim | 6502091 | 2020-05-20 12:08:20 -0700 | [diff] [blame] | 779 | "word": "A String", # The word corresponding to this set of information. |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 780 | }, |
| 781 | ], |
| 782 | }, |
| 783 | ], |
Bu Sun Kim | 715bd7f | 2019-06-14 16:50:42 -0700 | [diff] [blame] | 784 | }, |
| 785 | ], |
| 786 | }</pre> |
| 787 | </div> |
| 788 | |
| 789 | </body></html> |