docs: update docs (#916)
* fix: re-run script
* test: fix noxfile
diff --git a/docs/dyn/speech_v1.speech.html b/docs/dyn/speech_v1.speech.html
index cfa98c7..fd71ac9 100644
--- a/docs/dyn/speech_v1.speech.html
+++ b/docs/dyn/speech_v1.speech.html
@@ -99,6 +99,73 @@
"config": { # Provides information to the recognizer that specifies how to process the # Required. Provides information to the recognizer that specifies how to
# process the request.
# request.
+ "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and
+ # the start and end time offsets (timestamps) for those words. If
+ # `false`, no word-level time offset information is returned. The default is
+ # `false`.
+ "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
+ # parameters to make diarization better suited for your application.
+ # Note: When this is enabled, we send all the words from the beginning of the
+ # audio for the top alternative in every consecutive STREAMING responses.
+ # This is done in order to improve our speaker tags as our models learn to
+ # identify the speakers in the conversation over time.
+ # For non-streaming requests, the diarization results will be provided only
+ # in the top alternative of the FINAL SpeechRecognitionResult.
+ "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more
+ # flexibility by allowing the system to automatically determine the correct
+ # number of speakers. If not set, the default value is 2.
+ "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more
+ # flexibility by allowing the system to automatically determine the correct
+ # number of speakers. If not set, the default value is 6.
+ "speakerTag": 42, # Output only. Unused.
+ "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in
+ # the top alternative of the recognition result using a speaker_tag provided
+ # in the WordInfo.
+ },
+ "languageCode": "A String", # Required. The language of the supplied audio as a
+ # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
+ # Example: "en-US".
+ # See [Language
+ # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
+ # of the currently supported language codes.
+ "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out
+ # profanities, replacing all but the initial character in each filtered word
+ # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
+ # won't be filtered out.
+ "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition.
+ # If `use_enhanced` is set to true and the `model` field is not set, then
+ # an appropriate enhanced model is chosen if an enhanced model exists for
+ # the audio.
+ #
+ # If `use_enhanced` is true and an enhanced version of the specified model
+ # does not exist, then the speech is recognized using the standard version
+ # of the specified model.
+ "metadata": { # Description of audio data to be recognized. # Metadata regarding this request.
+ "originalMediaType": "A String", # The original media the speech was recorded on.
+ "recordingDeviceType": "A String", # The type of device the speech was recorded with.
+ "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
+ "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
+ # hearings from 2012".
+ "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`,
+ # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
+ # A list of possible audio mime types is maintained at
+ # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
+ "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or
+ # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
+ # 'Cardioid Microphone'.
+ "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
+ # closely applies. This is most indicative of the topics contained
+ # in the audio. Use the 6-digit NAICS code to identify the industry
+ # vertical - see https://www.naics.com/search/.
+ "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
+ },
+ "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
+ # `RecognitionAudio` messages. Valid values are: 8000-48000.
+ # 16000 is optimal. For best results, set the sampling rate of the audio
+ # source to 16000 Hz. If that's not possible, use the native sample rate of
+ # the audio source (instead of re-sampling).
+ # This field is optional for FLAC and WAV audio files, but is
+ # required for all other audio formats. For details, see AudioEncoding.
"enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1
# to get each channel recognized separately. The recognition result will
# contain a `channel_tag` field to state which channel that result belongs
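The hunk above moves the detailed `RecognitionConfig` field descriptions earlier in the documented request body. As a minimal sketch of how those fields fit together (the encoding, sample rate, and speaker counts below are illustrative assumptions, not values taken from this diff):

```python
# Sketch of a RecognitionConfig dict using the fields documented above.
# Keys mirror the generated docs; the chosen values are assumptions.
config = {
    "encoding": "LINEAR16",          # optional for FLAC/WAV, required otherwise
    "sampleRateHertz": 16000,        # 16000 Hz is the recommended rate
    "languageCode": "en-US",         # required BCP-47 language tag
    "enableWordTimeOffsets": True,   # return per-word start/end timestamps
    "profanityFilter": False,
    "diarizationConfig": {
        "enableSpeakerDiarization": True,
        "minSpeakerCount": 2,        # documented default is 2
        "maxSpeakerCount": 6,        # documented default is 6
    },
}
```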
@@ -115,9 +182,6 @@
# The server may return fewer than `max_alternatives`.
# Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
# one. If omitted, will return a maximum of one.
- "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
- # This field is optional for `FLAC` and `WAV` audio files and required
- # for all other audio formats. For details, see AudioEncoding.
"speechContexts": [ # Array of SpeechContext.
# A means to provide context to assist the speech recognition. For more
# information, see
@@ -141,6 +205,9 @@
],
},
],
+ "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
+ # This field is optional for `FLAC` and `WAV` audio files and required
+ # for all other audio formats. For details, see AudioEncoding.
"model": "A String", # Which model to select for the given request. Select the model
# best suited to your domain to get best results. If a model is not
# explicitly specified, then we auto-select a model based on the parameters
@@ -182,78 +249,14 @@
# Note: We only recognize the first channel by default.
# To perform independent recognition on each channel set
# `enable_separate_recognition_per_channel` to 'true'.
- "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
- # parameters to make diarization better suited for your application.
- # Note: When this is enabled, we send all the words from the beginning of the
- # audio for the top alternative in every consecutive STREAMING responses.
- # This is done in order to improve our speaker tags as our models learn to
- # identify the speakers in the conversation over time.
- # For non-streaming requests, the diarization results will be provided only
- # in the top alternative of the FINAL SpeechRecognitionResult.
- "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more
- # flexibility by allowing the system to automatically determine the correct
- # number of speakers. If not set, the default value is 2.
- "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more
- # flexibility by allowing the system to automatically determine the correct
- # number of speakers. If not set, the default value is 6.
- "speakerTag": 42, # Output only. Unused.
- "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in
- # the top alternative of the recognition result using a speaker_tag provided
- # in the WordInfo.
- },
- "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and
- # the start and end time offsets (timestamps) for those words. If
- # `false`, no word-level time offset information is returned. The default is
- # `false`.
- "languageCode": "A String", # Required. The language of the supplied audio as a
- # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
- # Example: "en-US".
- # See [Language
- # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
- # of the currently supported language codes.
- "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out
- # profanities, replacing all but the initial character in each filtered word
- # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
- # won't be filtered out.
- "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition.
- # If `use_enhanced` is set to true and the `model` field is not set, then
- # an appropriate enhanced model is chosen if an enhanced model exists for
- # the audio.
- #
- # If `use_enhanced` is true and an enhanced version of the specified model
- # does not exist, then the speech is recognized using the standard version
- # of the specified model.
- "metadata": { # Description of audio data to be recognized. # Metadata regarding this request.
- "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or
- # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
- # 'Cardioid Microphone'.
- "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
- # hearings from 2012".
- "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`,
- # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
- # A list of possible audio mime types is maintained at
- # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
- "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
- "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
- # closely applies. This is most indicative of the topics contained
- # in the audio. Use the 6-digit NAICS code to identify the industry
- # vertical - see https://www.naics.com/search/.
- "originalMediaType": "A String", # The original media the speech was recorded on.
- "recordingDeviceType": "A String", # The type of device the speech was recorded with.
- "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
- },
- "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
- # `RecognitionAudio` messages. Valid values are: 8000-48000.
- # 16000 is optimal. For best results, set the sampling rate of the audio
- # source to 16000 Hz. If that's not possible, use the native sample rate of
- # the audio source (instead of re-sampling).
- # This field is optional for FLAC and WAV audio files, but is
- # required for all other audio formats. For details, see AudioEncoding.
},
"audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized.
# Either `content` or `uri` must be supplied. Supplying both or neither
# returns google.rpc.Code.INVALID_ARGUMENT. See
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
+ "content": "A String", # The audio data bytes encoded as specified in
+ # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
+ # pure binary representation, whereas JSON representations use base64.
"uri": "A String", # URI that points to a file that contains audio data bytes as specified in
# `RecognitionConfig`. The file must not be compressed (for example, gzip).
# Currently, only Google Cloud Storage URIs are
@@ -261,9 +264,6 @@
# `gs://bucket_name/object_name` (other URI formats return
# google.rpc.Code.INVALID_ARGUMENT). For more information, see
# [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
- "content": "A String", # The audio data bytes encoded as specified in
- # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
- # pure binary representation, whereas JSON representations use base64.
},
}
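The `audio` object documented above accepts exactly one of `content` or `uri`. A hedged sketch of both forms, assuming a hypothetical local file and Cloud Storage object name:

```python
import base64

# Inline the audio bytes (as noted above, the JSON representation of a
# bytes field is base64) ...
with open("call.wav", "rb") as f:          # hypothetical local recording
    audio_inline = {"content": base64.b64encode(f.read()).decode("utf-8")}

# ... or reference a Cloud Storage object instead; never set both fields,
# since supplying both or neither returns INVALID_ARGUMENT.
audio_gcs = {"uri": "gs://example-bucket/call.wav"}   # placeholder bucket/object
```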
@@ -277,9 +277,6 @@
{ # This resource represents a long-running operation that is the result of a
# network API call.
- "done": True or False, # If the value is `false`, it means the operation is still in progress.
- # If `true`, the operation is completed, and either `error` or `response` is
- # available.
"response": { # The normal response of the operation in case of success. If the original
# method returns no data on success, such as `Delete`, the response is
# `google.protobuf.Empty`. If the original method is standard
@@ -317,6 +314,9 @@
# long-running operation should document the metadata type, if any.
"a_key": "", # Properties of the object. Contains field @type with type URL.
},
+ "done": True or False, # If the value is `false`, it means the operation is still in progress.
+ # If `true`, the operation is completed, and either `error` or `response` is
+ # available.
}</pre>
</div>
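With a request body assembled, the long-running flow documented above is: submit via `longrunningrecognize`, then poll the returned Operation until `done` is true and read either `response` or `error`. A sketch assuming application-default credentials, the `config`/`audio_gcs` dicts from the earlier sketches, and the companion `operations` resource documented separately for this API:

```python
import time
from googleapiclient.discovery import build

service = build("speech", "v1")  # assumes application-default credentials

operation = service.speech().longrunningrecognize(
    body={"config": config, "audio": audio_gcs}
).execute()

# Poll until the Operation's `done` flag flips, then read `response` or `error`.
# `name` is the standard Operation identifier; production code would back off
# exponentially instead of sleeping a fixed interval.
while not operation.get("done", False):
    time.sleep(5)
    operation = service.operations().get(name=operation["name"]).execute()

outcome = operation.get("response") or operation.get("error")
```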
@@ -333,6 +333,73 @@
"config": { # Provides information to the recognizer that specifies how to process the # Required. Provides information to the recognizer that specifies how to
# process the request.
# request.
+ "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and
+ # the start and end time offsets (timestamps) for those words. If
+ # `false`, no word-level time offset information is returned. The default is
+ # `false`.
+ "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
+ # parameters to make diarization better suited for your application.
+ # Note: When this is enabled, we send all the words from the beginning of the
+ # audio for the top alternative in every consecutive STREAMING responses.
+ # This is done in order to improve our speaker tags as our models learn to
+ # identify the speakers in the conversation over time.
+ # For non-streaming requests, the diarization results will be provided only
+ # in the top alternative of the FINAL SpeechRecognitionResult.
+ "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more
+ # flexibility by allowing the system to automatically determine the correct
+ # number of speakers. If not set, the default value is 2.
+ "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more
+ # flexibility by allowing the system to automatically determine the correct
+ # number of speakers. If not set, the default value is 6.
+ "speakerTag": 42, # Output only. Unused.
+ "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in
+ # the top alternative of the recognition result using a speaker_tag provided
+ # in the WordInfo.
+ },
+ "languageCode": "A String", # Required. The language of the supplied audio as a
+ # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
+ # Example: "en-US".
+ # See [Language
+ # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
+ # of the currently supported language codes.
+ "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out
+ # profanities, replacing all but the initial character in each filtered word
+ # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
+ # won't be filtered out.
+ "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition.
+ # If `use_enhanced` is set to true and the `model` field is not set, then
+ # an appropriate enhanced model is chosen if an enhanced model exists for
+ # the audio.
+ #
+ # If `use_enhanced` is true and an enhanced version of the specified model
+ # does not exist, then the speech is recognized using the standard version
+ # of the specified model.
+ "metadata": { # Description of audio data to be recognized. # Metadata regarding this request.
+ "originalMediaType": "A String", # The original media the speech was recorded on.
+ "recordingDeviceType": "A String", # The type of device the speech was recorded with.
+ "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
+ "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
+ # hearings from 2012".
+ "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`,
+ # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
+ # A list of possible audio mime types is maintained at
+ # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
+ "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or
+ # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
+ # 'Cardioid Microphone'.
+ "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
+ # closely applies. This is most indicative of the topics contained
+ # in the audio. Use the 6-digit NAICS code to identify the industry
+ # vertical - see https://www.naics.com/search/.
+ "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
+ },
+ "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
+ # `RecognitionAudio` messages. Valid values are: 8000-48000.
+ # 16000 is optimal. For best results, set the sampling rate of the audio
+ # source to 16000 Hz. If that's not possible, use the native sample rate of
+ # the audio source (instead of re-sampling).
+ # This field is optional for FLAC and WAV audio files, but is
+ # required for all other audio formats. For details, see AudioEncoding.
"enableSeparateRecognitionPerChannel": True or False, # This needs to be set to `true` explicitly and `audio_channel_count` > 1
# to get each channel recognized separately. The recognition result will
# contain a `channel_tag` field to state which channel that result belongs
@@ -349,9 +416,6 @@
# The server may return fewer than `max_alternatives`.
# Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
# one. If omitted, will return a maximum of one.
- "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
- # This field is optional for `FLAC` and `WAV` audio files and required
- # for all other audio formats. For details, see AudioEncoding.
"speechContexts": [ # Array of SpeechContext.
# A means to provide context to assist the speech recognition. For more
# information, see
@@ -375,6 +439,9 @@
],
},
],
+ "encoding": "A String", # Encoding of audio data sent in all `RecognitionAudio` messages.
+ # This field is optional for `FLAC` and `WAV` audio files and required
+ # for all other audio formats. For details, see AudioEncoding.
"model": "A String", # Which model to select for the given request. Select the model
# best suited to your domain to get best results. If a model is not
# explicitly specified, then we auto-select a model based on the parameters
@@ -416,78 +483,14 @@
# Note: We only recognize the first channel by default.
# To perform independent recognition on each channel set
# `enable_separate_recognition_per_channel` to 'true'.
- "diarizationConfig": { # Config to enable speaker diarization. # Config to enable speaker diarization and set additional
- # parameters to make diarization better suited for your application.
- # Note: When this is enabled, we send all the words from the beginning of the
- # audio for the top alternative in every consecutive STREAMING responses.
- # This is done in order to improve our speaker tags as our models learn to
- # identify the speakers in the conversation over time.
- # For non-streaming requests, the diarization results will be provided only
- # in the top alternative of the FINAL SpeechRecognitionResult.
- "minSpeakerCount": 42, # Minimum number of speakers in the conversation. This range gives you more
- # flexibility by allowing the system to automatically determine the correct
- # number of speakers. If not set, the default value is 2.
- "maxSpeakerCount": 42, # Maximum number of speakers in the conversation. This range gives you more
- # flexibility by allowing the system to automatically determine the correct
- # number of speakers. If not set, the default value is 6.
- "speakerTag": 42, # Output only. Unused.
- "enableSpeakerDiarization": True or False, # If 'true', enables speaker detection for each recognized word in
- # the top alternative of the recognition result using a speaker_tag provided
- # in the WordInfo.
- },
- "enableWordTimeOffsets": True or False, # If `true`, the top result includes a list of words and
- # the start and end time offsets (timestamps) for those words. If
- # `false`, no word-level time offset information is returned. The default is
- # `false`.
- "languageCode": "A String", # Required. The language of the supplied audio as a
- # [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
- # Example: "en-US".
- # See [Language
- # Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
- # of the currently supported language codes.
- "profanityFilter": True or False, # If set to `true`, the server will attempt to filter out
- # profanities, replacing all but the initial character in each filtered word
- # with asterisks, e.g. "f***". If set to `false` or omitted, profanities
- # won't be filtered out.
- "useEnhanced": True or False, # Set to true to use an enhanced model for speech recognition.
- # If `use_enhanced` is set to true and the `model` field is not set, then
- # an appropriate enhanced model is chosen if an enhanced model exists for
- # the audio.
- #
- # If `use_enhanced` is true and an enhanced version of the specified model
- # does not exist, then the speech is recognized using the standard version
- # of the specified model.
- "metadata": { # Description of audio data to be recognized. # Metadata regarding this request.
- "recordingDeviceName": "A String", # The device used to make the recording. Examples 'Nexus 5X' or
- # 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
- # 'Cardioid Microphone'.
- "audioTopic": "A String", # Description of the content. Eg. "Recordings of federal supreme court
- # hearings from 2012".
- "originalMimeType": "A String", # Mime type of the original audio file. For example `audio/m4a`,
- # `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
- # A list of possible audio mime types is maintained at
- # http://www.iana.org/assignments/media-types/media-types.xhtml#audio
- "microphoneDistance": "A String", # The audio type that most closely describes the audio being recognized.
- "industryNaicsCodeOfAudio": 42, # The industry vertical to which this speech recognition request most
- # closely applies. This is most indicative of the topics contained
- # in the audio. Use the 6-digit NAICS code to identify the industry
- # vertical - see https://www.naics.com/search/.
- "originalMediaType": "A String", # The original media the speech was recorded on.
- "recordingDeviceType": "A String", # The type of device the speech was recorded with.
- "interactionType": "A String", # The use case most closely describing the audio content to be recognized.
- },
- "sampleRateHertz": 42, # Sample rate in Hertz of the audio data sent in all
- # `RecognitionAudio` messages. Valid values are: 8000-48000.
- # 16000 is optimal. For best results, set the sampling rate of the audio
- # source to 16000 Hz. If that's not possible, use the native sample rate of
- # the audio source (instead of re-sampling).
- # This field is optional for FLAC and WAV audio files, but is
- # required for all other audio formats. For details, see AudioEncoding.
},
"audio": { # Contains audio data in the encoding specified in the `RecognitionConfig`. # Required. The audio data to be recognized.
# Either `content` or `uri` must be supplied. Supplying both or neither
# returns google.rpc.Code.INVALID_ARGUMENT. See
# [content limits](https://cloud.google.com/speech-to-text/quotas#content).
+ "content": "A String", # The audio data bytes encoded as specified in
+ # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
+ # pure binary representation, whereas JSON representations use base64.
"uri": "A String", # URI that points to a file that contains audio data bytes as specified in
# `RecognitionConfig`. The file must not be compressed (for example, gzip).
# Currently, only Google Cloud Storage URIs are
@@ -495,9 +498,6 @@
# `gs://bucket_name/object_name` (other URI formats return
# google.rpc.Code.INVALID_ARGUMENT). For more information, see
# [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
- "content": "A String", # The audio data bytes encoded as specified in
- # `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
- # pure binary representation, whereas JSON representations use base64.
},
}
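For the synchronous `recognize` request documented above, the same `config` dict can be extended with the `speechContexts` and `encoding`/`model` fields this diff reorders. The phrase list and model name below are illustrative assumptions; the `phrases` field belongs to SpeechContext but is elided from the hunks shown here:

```python
# Sketch: bias recognition toward domain terms and pick a model explicitly.
config["speechContexts"] = [{"phrases": ["speaker diarization", "NAICS"]}]
config["model"] = "phone_call"   # assumed choice; pick the model matching your audio
```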
@@ -515,6 +515,9 @@
"results": [ # Sequential list of transcription results corresponding to
# sequential portions of audio.
{ # A speech recognition result corresponding to a portion of the audio.
+ "channelTag": 42, # For multi-channel audio, this is the channel number corresponding to the
+ # recognized result for the audio from that channel.
+ # For audio_channel_count = N, its output values can range from '1' to 'N'.
"alternatives": [ # May contain one or more recognition hypotheses (up to the
# maximum specified in `max_alternatives`).
# These alternatives are ordered in terms of accuracy, with the top (first)
@@ -532,12 +535,6 @@
# Note: When `enable_speaker_diarization` is true, you will see all the words
# from the beginning of the audio.
{ # Word-specific information for recognized words.
- "word": "A String", # The word corresponding to this set of information.
- "speakerTag": 42, # Output only. A distinct integer value is assigned for every speaker within
- # the audio. This field specifies which one of those speakers was detected to
- # have spoken this word. Value ranges from '1' to diarization_speaker_count.
- # speaker_tag is set if enable_speaker_diarization = 'true' and only in the
- # top alternative.
"endTime": "A String", # Time offset relative to the beginning of the audio,
# and corresponding to the end of the spoken word.
# This field is only set if `enable_word_time_offsets=true` and only
@@ -550,13 +547,16 @@
# in the top hypothesis.
# This is an experimental feature and the accuracy of the time offset can
# vary.
+ "word": "A String", # The word corresponding to this set of information.
+ "speakerTag": 42, # Output only. A distinct integer value is assigned for every speaker within
+ # the audio. This field specifies which one of those speakers was detected to
+ # have spoken this word. Value ranges from '1' to diarization_speaker_count.
+ # speaker_tag is set if enable_speaker_diarization = 'true' and only in the
+ # top alternative.
},
],
},
],
- "channelTag": 42, # For multi-channel audio, this is the channel number corresponding to the
- # recognized result for the audio from that channel.
- # For audio_channel_count = N, its output values can range from '1' to 'N'.
},
],
}</pre>
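Finally, a sketch of walking the `recognize` response documented above. The `transcript` and `confidence` keys come from the alternative schema that is elided in this excerpt; `service`, `config`, and `audio_inline` are the earlier sketches:

```python
response = service.speech().recognize(
    body={"config": config, "audio": audio_inline}
).execute()

for result in response.get("results", []):
    channel = result.get("channelTag")       # only set for multi-channel audio
    top = result["alternatives"][0]          # ordered by accuracy, best first
    print(channel, top.get("transcript"), top.get("confidence"))
    for info in top.get("words", []):        # populated when enableWordTimeOffsets=True
        print(info["word"], info.get("startTime"), info.get("endTime"),
              info.get("speakerTag"))        # speakerTag requires diarization
```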