From 717172f8724dff8d0b921a3bf2c14d16169e141b Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 25 May 2021 06:28:45 -0400 Subject: [PATCH] chore: delete unused protos (#164) --- .../cloud/speech_v1/proto/cloud_speech.proto | 759 --------------- .../speech_v1p1beta1/proto/cloud_speech.proto | 881 ------------------ .../proto/cloud_speech_adaptation.proto | 320 ------- .../speech_v1p1beta1/proto/resource.proto | 132 --- 4 files changed, 2092 deletions(-) delete mode 100644 packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto delete mode 100644 packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech.proto delete mode 100644 packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech_adaptation.proto delete mode 100644 packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/resource.proto diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto b/packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto deleted file mode 100644 index f343fa21da85..000000000000 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto +++ /dev/null @@ -1,759 +0,0 @@ -// Copyright 2019 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -syntax = "proto3"; - -package google.cloud.speech.v1; - -import "google/api/annotations.proto"; -import "google/api/client.proto"; -import "google/api/field_behavior.proto"; -import "google/longrunning/operations.proto"; -import "google/protobuf/any.proto"; -import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; -import "google/rpc/status.proto"; - -option cc_enable_arenas = true; -option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech"; -option java_multiple_files = true; -option java_outer_classname = "SpeechProto"; -option java_package = "com.google.cloud.speech.v1"; -option objc_class_prefix = "GCS"; - -// Service that implements Google Cloud Speech API. -service Speech { - option (google.api.default_host) = "speech.googleapis.com"; - option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform"; - - // Performs synchronous speech recognition: receive results after all audio - // has been sent and processed. - rpc Recognize(RecognizeRequest) returns (RecognizeResponse) { - option (google.api.http) = { - post: "/v1/speech:recognize" - body: "*" - }; - option (google.api.method_signature) = "config,audio"; - } - - // Performs asynchronous speech recognition: receive results via the - // google.longrunning.Operations interface. Returns either an - // `Operation.error` or an `Operation.response` which contains - // a `LongRunningRecognizeResponse` message. - // For more information on asynchronous speech recognition, see the - // [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize). 
- rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) { - option (google.api.http) = { - post: "/v1/speech:longrunningrecognize" - body: "*" - }; - option (google.api.method_signature) = "config,audio"; - option (google.longrunning.operation_info) = { - response_type: "LongRunningRecognizeResponse" - metadata_type: "LongRunningRecognizeMetadata" - }; - } - - // Performs bidirectional streaming speech recognition: receive results while - // sending audio. This method is only available via the gRPC API (not REST). - rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { - } -} - -// The top-level message sent by the client for the `Recognize` method. -message RecognizeRequest { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. The audio data to be recognized. - RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED]; -} - -// The top-level message sent by the client for the `LongRunningRecognize` -// method. -message LongRunningRecognizeRequest { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. The audio data to be recognized. - RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED]; -} - -// The top-level message sent by the client for the `StreamingRecognize` method. -// Multiple `StreamingRecognizeRequest` messages are sent. The first message -// must contain a `streaming_config` message and must not contain -// `audio_content`. All subsequent messages must contain `audio_content` and -// must not contain a `streaming_config` message. -message StreamingRecognizeRequest { - // The streaming request, which is either a streaming config or audio content. - oneof streaming_request { - // Provides information to the recognizer that specifies how to process the - // request. The first `StreamingRecognizeRequest` message must contain a - // `streaming_config` message. - StreamingRecognitionConfig streaming_config = 1; - - // The audio data to be recognized. Sequential chunks of audio data are sent - // in sequential `StreamingRecognizeRequest` messages. The first - // `StreamingRecognizeRequest` message must not contain `audio_content` data - // and all subsequent `StreamingRecognizeRequest` messages must contain - // `audio_content` data. The audio bytes must be encoded as specified in - // `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a - // pure binary representation (not base64). See - // [content limits](https://cloud.google.com/speech-to-text/quotas#content). - bytes audio_content = 2; - } -} - -// Provides information to the recognizer that specifies how to process the -// request. -message StreamingRecognitionConfig { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // If `false` or omitted, the recognizer will perform continuous - // recognition (continuing to wait for and process audio even if the user - // pauses speaking) until the client closes the input stream (gRPC API) or - // until the maximum time limit has been reached. May return multiple - // `StreamingRecognitionResult`s with the `is_final` flag set to `true`. 
-  //
-  // If `true`, the recognizer will detect a single spoken utterance. When it
-  // detects that the user has paused or stopped speaking, it will return an
-  // `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
-  // more than one `StreamingRecognitionResult` with the `is_final` flag set to
-  // `true`.
-  bool single_utterance = 2;
-
-  // If `true`, interim results (tentative hypotheses) may be
-  // returned as they become available (these interim results are indicated with
-  // the `is_final=false` flag).
-  // If `false` or omitted, only `is_final=true` result(s) are returned.
-  bool interim_results = 3;
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message RecognitionConfig {
-  // The encoding of the audio data sent in the request.
-  //
-  // All encodings support only 1 channel (mono) audio, unless the
-  // `audio_channel_count` and `enable_separate_recognition_per_channel` fields
-  // are set.
-  //
-  // For best results, the audio source should be captured and transmitted using
-  // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
-  // recognition can be reduced if lossy codecs are used to capture or transmit
-  // audio, particularly if background noise is present. Lossy codecs include
-  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
-  //
-  // The `FLAC` and `WAV` audio file formats include a header that describes the
-  // included audio content. You can request recognition for `WAV` files that
-  // contain either `LINEAR16` or `MULAW` encoded audio.
-  // If you send `FLAC` or `WAV` audio file format in
-  // your request, you do not need to specify an `AudioEncoding`; the audio
-  // encoding format is determined from the file header. If you specify
-  // an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
-  // encoding configuration must match the encoding described in the audio
-  // header; otherwise the request returns an
-  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
-  enum AudioEncoding {
-    // Not specified.
-    ENCODING_UNSPECIFIED = 0;
-
-    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
-    LINEAR16 = 1;
-
-    // `FLAC` (Free Lossless Audio
-    // Codec) is the recommended encoding because it is
-    // lossless--therefore recognition is not compromised--and
-    // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
-    // encoding supports 16-bit and 24-bit samples, however, not all fields in
-    // `STREAMINFO` are supported.
-    FLAC = 2;
-
-    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
-    MULAW = 3;
-
-    // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
-    AMR = 4;
-
-    // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
-    AMR_WB = 5;
-
-    // Opus encoded audio frames in Ogg container
-    // ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
-    OGG_OPUS = 6;
-
-    // Although the use of lossy encodings is not recommended, if a very low
-    // bitrate encoding is required, `OGG_OPUS` is highly preferred over
-    // Speex encoding. The [Speex](https://speex.org/) encoding supported by
-    // Cloud Speech API has a header byte in each block, as in MIME type
-    // `audio/x-speex-with-header-byte`.
-    // It is a variant of the RTP Speex encoding defined in
-    // [RFC 5574](https://tools.ietf.org/html/rfc5574).
- // The stream is a sequence of blocks, one block per RTP packet. Each block - // starts with a byte containing the length of the block, in bytes, followed - // by one or more frames of Speex data, padded to an integral number of - // bytes (octets) as specified in RFC 5574. In other words, each RTP header - // is replaced with a single byte containing the block length. Only Speex - // wideband is supported. `sample_rate_hertz` must be 16000. - SPEEX_WITH_HEADER_BYTE = 7; - } - - // Encoding of audio data sent in all `RecognitionAudio` messages. - // This field is optional for `FLAC` and `WAV` audio files and required - // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. - AudioEncoding encoding = 1; - - // Sample rate in Hertz of the audio data sent in all - // `RecognitionAudio` messages. Valid values are: 8000-48000. - // 16000 is optimal. For best results, set the sampling rate of the audio - // source to 16000 Hz. If that's not possible, use the native sample rate of - // the audio source (instead of re-sampling). - // This field is optional for FLAC and WAV audio files, but is - // required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. - int32 sample_rate_hertz = 2; - - // The number of channels in the input audio data. - // ONLY set this for MULTI-CHANNEL recognition. - // Valid values for LINEAR16 and FLAC are `1`-`8`. - // Valid values for OGG_OPUS are '1'-'254'. - // Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. - // If `0` or omitted, defaults to one channel (mono). - // Note: We only recognize the first channel by default. - // To perform independent recognition on each channel set - // `enable_separate_recognition_per_channel` to 'true'. - int32 audio_channel_count = 7; - - // This needs to be set to `true` explicitly and `audio_channel_count` > 1 - // to get each channel recognized separately. The recognition result will - // contain a `channel_tag` field to state which channel that result belongs - // to. If this is not true, we will only recognize the first channel. The - // request is billed cumulatively for all channels recognized: - // `audio_channel_count` multiplied by the length of the audio. - bool enable_separate_recognition_per_channel = 12; - - // Required. The language of the supplied audio as a - // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. - // Example: "en-US". - // See [Language - // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list - // of the currently supported language codes. - string language_code = 3 [(google.api.field_behavior) = REQUIRED]; - - // Maximum number of recognition hypotheses to be returned. - // Specifically, the maximum number of `SpeechRecognitionAlternative` messages - // within each `SpeechRecognitionResult`. - // The server may return fewer than `max_alternatives`. - // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of - // one. If omitted, will return a maximum of one. - int32 max_alternatives = 4; - - // If set to `true`, the server will attempt to filter out - // profanities, replacing all but the initial character in each filtered word - // with asterisks, e.g. "f***". If set to `false` or omitted, profanities - // won't be filtered out. - bool profanity_filter = 5; - - // Array of [SpeechContext][google.cloud.speech.v1.SpeechContext]. 
-  // A means to provide context to assist the speech recognition. For more
-  // information, see
-  // [speech
-  // adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
-  repeated SpeechContext speech_contexts = 6;
-
-  // If `true`, the top result includes a list of words and
-  // the start and end time offsets (timestamps) for those words. If
-  // `false`, no word-level time offset information is returned. The default is
-  // `false`.
-  bool enable_word_time_offsets = 8;
-
-  // If 'true', adds punctuation to recognition result hypotheses.
-  // This feature is only available in select languages. Setting this for
-  // requests in other languages has no effect at all.
-  // The default 'false' value does not add punctuation to result hypotheses.
-  // Note: This is currently offered as an experimental service, complimentary
-  // to all users. In the future this may be exclusively available as a
-  // premium feature.
-  bool enable_automatic_punctuation = 11;
-
-  // Config to enable speaker diarization and set additional
-  // parameters to make diarization better suited for your application.
-  // Note: When this is enabled, we send all the words from the beginning of the
-  // audio for the top alternative in every consecutive STREAMING responses.
-  // This is done in order to improve our speaker tags as our models learn to
-  // identify the speakers in the conversation over time.
-  // For non-streaming requests, the diarization results will be provided only
-  // in the top alternative of the FINAL SpeechRecognitionResult.
-  SpeakerDiarizationConfig diarization_config = 19;
-
-  // Metadata regarding this request.
-  RecognitionMetadata metadata = 9;
-
-  // Which model to select for the given request. Select the model
-  // best suited to your domain to get best results. If a model is not
-  // explicitly specified, then we auto-select a model based on the parameters
-  // in the RecognitionConfig.
-  //
-  //   <table>
-  //     <tr>
-  //       <td><b>Model</b></td>
-  //       <td><b>Description</b></td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>command_and_search</code></td>
-  //       <td>Best for short queries such as voice commands or voice search.</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>phone_call</code></td>
-  //       <td>Best for audio that originated from a phone call (typically
-  //       recorded at an 8khz sampling rate).</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>video</code></td>
-  //       <td>Best for audio that originated from video or includes multiple
-  //           speakers. Ideally the audio is recorded at a 16khz or greater
-  //           sampling rate. This is a premium model that costs more than the
-  //           standard rate.</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>default</code></td>
-  //       <td>Best for audio that is not one of the specific audio models.
-  //           For example, long-form audio. Ideally the audio is high-fidelity,
-  //           recorded at a 16khz or greater sampling rate.</td>
-  //     </tr>
-  //   </table>
-  string model = 13;
-
-  // Set to true to use an enhanced model for speech recognition.
-  // If `use_enhanced` is set to true and the `model` field is not set, then
-  // an appropriate enhanced model is chosen if an enhanced model exists for
-  // the audio.
-  //
-  // If `use_enhanced` is true and an enhanced version of the specified model
-  // does not exist, then the speech is recognized using the standard version
-  // of the specified model.
-  bool use_enhanced = 14;
-}
-
-// Config to enable speaker diarization.
-message SpeakerDiarizationConfig {
-  // If 'true', enables speaker detection for each recognized word in
-  // the top alternative of the recognition result using a speaker_tag provided
-  // in the WordInfo.
-  bool enable_speaker_diarization = 1;
-
-  // Minimum number of speakers in the conversation. This range gives you more
-  // flexibility by allowing the system to automatically determine the correct
-  // number of speakers. If not set, the default value is 2.
-  int32 min_speaker_count = 2;
-
-  // Maximum number of speakers in the conversation. This range gives you more
-  // flexibility by allowing the system to automatically determine the correct
-  // number of speakers. If not set, the default value is 6.
-  int32 max_speaker_count = 3;
-
-  // Unused.
-  int32 speaker_tag = 5
-      [(google.api.field_behavior) = OUTPUT_ONLY, deprecated = true];
-}
-
-// Description of audio data to be recognized.
-message RecognitionMetadata {
-  // Use case categories that the audio recognition request can be described
-  // by.
-  enum InteractionType {
-    // Use case is either unknown or is something other than one of the other
-    // values below.
-    INTERACTION_TYPE_UNSPECIFIED = 0;
-
-    // Multiple people in a conversation or discussion. For example in a
-    // meeting with two or more people actively participating. Typically
-    // all the primary people speaking would be in the same room (if not,
-    // see PHONE_CALL).
-    DISCUSSION = 1;
-
-    // One or more persons lecturing or presenting to others, mostly
-    // uninterrupted.
-    PRESENTATION = 2;
-
-    // A phone-call or video-conference in which two or more people, who are
-    // not in the same room, are actively participating.
-    PHONE_CALL = 3;
-
-    // A recorded message intended for another person to listen to.
-    VOICEMAIL = 4;
-
-    // Professionally produced audio (e.g. TV Show, Podcast).
-    PROFESSIONALLY_PRODUCED = 5;
-
-    // Transcribe spoken questions and queries into text.
-    VOICE_SEARCH = 6;
-
-    // Transcribe voice commands, such as for controlling a device.
-    VOICE_COMMAND = 7;
-
-    // Transcribe speech to text to create a written document, such as a
-    // text-message, email or report.
-    DICTATION = 8;
-  }
-
-  // Enumerates the types of capture settings describing an audio file.
-  enum MicrophoneDistance {
-    // Audio type is not known.
-    MICROPHONE_DISTANCE_UNSPECIFIED = 0;
-
-    // The audio was captured from a closely placed microphone. E.g. phone,
-    // dictaphone, or handheld microphone. Generally, if the speaker is within
-    // 1 meter of the microphone.
-    NEARFIELD = 1;
-
-    // The speaker is within 3 meters of the microphone.
-    MIDFIELD = 2;
-
-    // The speaker is more than 3 meters away from the microphone.
-    FARFIELD = 3;
-  }
-
-  // The original media the speech was recorded on.
-  enum OriginalMediaType {
-    // Unknown original media type.
-    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
-
-    // The speech data is an audio recording.
-    AUDIO = 1;
-
-    // The speech data was originally recorded on a video.
- VIDEO = 2; - } - - // The type of device the speech was recorded with. - enum RecordingDeviceType { - // The recording device is unknown. - RECORDING_DEVICE_TYPE_UNSPECIFIED = 0; - - // Speech was recorded on a smartphone. - SMARTPHONE = 1; - - // Speech was recorded using a personal computer or tablet. - PC = 2; - - // Speech was recorded over a phone line. - PHONE_LINE = 3; - - // Speech was recorded in a vehicle. - VEHICLE = 4; - - // Speech was recorded outdoors. - OTHER_OUTDOOR_DEVICE = 5; - - // Speech was recorded indoors. - OTHER_INDOOR_DEVICE = 6; - } - - // The use case most closely describing the audio content to be recognized. - InteractionType interaction_type = 1; - - // The industry vertical to which this speech recognition request most - // closely applies. This is most indicative of the topics contained - // in the audio. Use the 6-digit NAICS code to identify the industry - // vertical - see https://www.naics.com/search/. - uint32 industry_naics_code_of_audio = 3; - - // The audio type that most closely describes the audio being recognized. - MicrophoneDistance microphone_distance = 4; - - // The original media the speech was recorded on. - OriginalMediaType original_media_type = 5; - - // The type of device the speech was recorded with. - RecordingDeviceType recording_device_type = 6; - - // The device used to make the recording. Examples 'Nexus 5X' or - // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or - // 'Cardioid Microphone'. - string recording_device_name = 7; - - // Mime type of the original audio file. For example `audio/m4a`, - // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`. - // A list of possible audio mime types is maintained at - // http://www.iana.org/assignments/media-types/media-types.xhtml#audio - string original_mime_type = 8; - - // Description of the content. Eg. "Recordings of federal supreme court - // hearings from 2012". - string audio_topic = 10; -} - -// Provides "hints" to the speech recognizer to favor specific words and phrases -// in the results. -message SpeechContext { - // A list of strings containing words and phrases "hints" so that - // the speech recognition is more likely to recognize them. This can be used - // to improve the accuracy for specific words and phrases, for example, if - // specific commands are typically spoken by the user. This can also be used - // to add additional words to the vocabulary of the recognizer. See - // [usage limits](https://cloud.google.com/speech-to-text/quotas#content). - // - // List items can also be set to classes for groups of words that represent - // common concepts that occur in natural language. For example, rather than - // providing phrase hints for every month of the year, using the $MONTH class - // improves the likelihood of correctly transcribing audio that includes - // months. - repeated string phrases = 1; -} - -// Contains audio data in the encoding specified in the `RecognitionConfig`. -// Either `content` or `uri` must be supplied. Supplying both or neither -// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See -// [content limits](https://cloud.google.com/speech-to-text/quotas#content). -message RecognitionAudio { - // The audio source, which is either inline content or a Google Cloud - // Storage uri. - oneof audio_source { - // The audio data bytes encoded as specified in - // `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a - // pure binary representation, whereas JSON representations use base64. 
- bytes content = 1; - - // URI that points to a file that contains audio data bytes as specified in - // `RecognitionConfig`. The file must not be compressed (for example, gzip). - // Currently, only Google Cloud Storage URIs are - // supported, which must be specified in the following format: - // `gs://bucket_name/object_name` (other URI formats return - // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see - // [Request URIs](https://cloud.google.com/storage/docs/reference-uris). - string uri = 2; - } -} - -// The only message returned to the client by the `Recognize` method. It -// contains the result as zero or more sequential `SpeechRecognitionResult` -// messages. -message RecognizeResponse { - // Sequential list of transcription results corresponding to - // sequential portions of audio. - repeated SpeechRecognitionResult results = 2; -} - -// The only message returned to the client by the `LongRunningRecognize` method. -// It contains the result as zero or more sequential `SpeechRecognitionResult` -// messages. It is included in the `result.response` field of the `Operation` -// returned by the `GetOperation` call of the `google::longrunning::Operations` -// service. -message LongRunningRecognizeResponse { - // Sequential list of transcription results corresponding to - // sequential portions of audio. - repeated SpeechRecognitionResult results = 2; -} - -// Describes the progress of a long-running `LongRunningRecognize` call. It is -// included in the `metadata` field of the `Operation` returned by the -// `GetOperation` call of the `google::longrunning::Operations` service. -message LongRunningRecognizeMetadata { - // Approximate percentage of audio processed thus far. Guaranteed to be 100 - // when the audio is fully processed and the results are available. - int32 progress_percent = 1; - - // Time when the request was received. - google.protobuf.Timestamp start_time = 2; - - // Time of the most recent processing update. - google.protobuf.Timestamp last_update_time = 3; -} - -// `StreamingRecognizeResponse` is the only message returned to the client by -// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse` -// messages are streamed back to the client. If there is no recognizable -// audio, and `single_utterance` is set to false, then no messages are streamed -// back to the client. -// -// Here's an example of a series of ten `StreamingRecognizeResponse`s that might -// be returned while processing audio: -// -// 1. results { alternatives { transcript: "tube" } stability: 0.01 } -// -// 2. results { alternatives { transcript: "to be a" } stability: 0.01 } -// -// 3. results { alternatives { transcript: "to be" } stability: 0.9 } -// results { alternatives { transcript: " or not to be" } stability: 0.01 } -// -// 4. results { alternatives { transcript: "to be or not to be" -// confidence: 0.92 } -// alternatives { transcript: "to bee or not to bee" } -// is_final: true } -// -// 5. results { alternatives { transcript: " that's" } stability: 0.01 } -// -// 6. results { alternatives { transcript: " that is" } stability: 0.9 } -// results { alternatives { transcript: " the question" } stability: 0.01 } -// -// 7. results { alternatives { transcript: " that is the question" -// confidence: 0.98 } -// alternatives { transcript: " that was the question" } -// is_final: true } -// -// Notes: -// -// - Only two of the above responses #4 and #7 contain final results; they are -// indicated by `is_final: true`. 
Concatenating these together generates the -// full transcript: "to be or not to be that is the question". -// -// - The others contain interim `results`. #3 and #6 contain two interim -// `results`: the first portion has a high stability and is less likely to -// change; the second portion has a low stability and is very likely to -// change. A UI designer might choose to show only high stability `results`. -// -// - The specific `stability` and `confidence` values shown above are only for -// illustrative purposes. Actual values may vary. -// -// - In each response, only one of these fields will be set: -// `error`, -// `speech_event_type`, or -// one or more (repeated) `results`. -message StreamingRecognizeResponse { - // Indicates the type of speech event. - enum SpeechEventType { - // No speech event specified. - SPEECH_EVENT_UNSPECIFIED = 0; - - // This event indicates that the server has detected the end of the user's - // speech utterance and expects no additional speech. Therefore, the server - // will not process additional audio (although it may subsequently return - // additional results). The client should stop sending additional audio - // data, half-close the gRPC connection, and wait for any additional results - // until the server closes the gRPC connection. This event is only sent if - // `single_utterance` was set to `true`, and is not used otherwise. - END_OF_SINGLE_UTTERANCE = 1; - } - - // If set, returns a [google.rpc.Status][google.rpc.Status] message that - // specifies the error for the operation. - google.rpc.Status error = 1; - - // This repeated list contains zero or more results that - // correspond to consecutive portions of the audio currently being processed. - // It contains zero or one `is_final=true` result (the newly settled portion), - // followed by zero or more `is_final=false` results (the interim results). - repeated StreamingRecognitionResult results = 2; - - // Indicates the type of speech event. - SpeechEventType speech_event_type = 4; -} - -// A streaming speech recognition result corresponding to a portion of the audio -// that is currently being processed. -message StreamingRecognitionResult { - // May contain one or more recognition hypotheses (up to the - // maximum specified in `max_alternatives`). - // These alternatives are ordered in terms of accuracy, with the top (first) - // alternative being the most probable, as ranked by the recognizer. - repeated SpeechRecognitionAlternative alternatives = 1; - - // If `false`, this `StreamingRecognitionResult` represents an - // interim result that may change. If `true`, this is the final time the - // speech service will return this particular `StreamingRecognitionResult`, - // the recognizer will not return any further hypotheses for this portion of - // the transcript and corresponding audio. - bool is_final = 2; - - // An estimate of the likelihood that the recognizer will not - // change its guess about this interim result. Values range from 0.0 - // (completely unstable) to 1.0 (completely stable). - // This field is only provided for interim results (`is_final=false`). - // The default of 0.0 is a sentinel value indicating `stability` was not set. - float stability = 3; - - // Time offset of the end of this result relative to the - // beginning of the audio. - google.protobuf.Duration result_end_time = 4; - - // For multi-channel audio, this is the channel number corresponding to the - // recognized result for the audio from that channel. 
- // For audio_channel_count = N, its output values can range from '1' to 'N'. - int32 channel_tag = 5; - - // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of - // the language in this result. This language code was detected to have the - // most likelihood of being spoken in the audio. - string language_code = 6 - [(google.api.field_behavior) = OUTPUT_ONLY]; -} - -// A speech recognition result corresponding to a portion of the audio. -message SpeechRecognitionResult { - // May contain one or more recognition hypotheses (up to the - // maximum specified in `max_alternatives`). - // These alternatives are ordered in terms of accuracy, with the top (first) - // alternative being the most probable, as ranked by the recognizer. - repeated SpeechRecognitionAlternative alternatives = 1; - - // For multi-channel audio, this is the channel number corresponding to the - // recognized result for the audio from that channel. - // For audio_channel_count = N, its output values can range from '1' to 'N'. - int32 channel_tag = 2; -} - -// Alternative hypotheses (a.k.a. n-best list). -message SpeechRecognitionAlternative { - // Transcript text representing the words that the user spoke. - string transcript = 1; - - // The confidence estimate between 0.0 and 1.0. A higher number - // indicates an estimated greater likelihood that the recognized words are - // correct. This field is set only for the top alternative of a non-streaming - // result or, of a streaming result where `is_final=true`. - // This field is not guaranteed to be accurate and users should not rely on it - // to be always provided. - // The default of 0.0 is a sentinel value indicating `confidence` was not set. - float confidence = 2; - - // A list of word-specific information for each recognized word. - // Note: When `enable_speaker_diarization` is true, you will see all the words - // from the beginning of the audio. - repeated WordInfo words = 3; -} - -// Word-specific information for recognized words. -message WordInfo { - // Time offset relative to the beginning of the audio, - // and corresponding to the start of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - google.protobuf.Duration start_time = 1; - - // Time offset relative to the beginning of the audio, - // and corresponding to the end of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - google.protobuf.Duration end_time = 2; - - // The word corresponding to this set of information. - string word = 3; - - // A distinct integer value is assigned for every speaker within - // the audio. This field specifies which one of those speakers was detected to - // have spoken this word. Value ranges from '1' to diarization_speaker_count. - // speaker_tag is set if enable_speaker_diarization = 'true' and only in the - // top alternative. 
- int32 speaker_tag = 5 - [(google.api.field_behavior) = OUTPUT_ONLY]; -} diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech.proto b/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech.proto deleted file mode 100644 index cf183d0129d3..000000000000 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech.proto +++ /dev/null @@ -1,881 +0,0 @@ - -// Copyright 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto3"; - -package google.cloud.speech.v1p1beta1; - -import "google/api/annotations.proto"; -import "google/api/client.proto"; -import "google/api/field_behavior.proto"; -import "google/cloud/speech/v1p1beta1/resource.proto"; -import "google/longrunning/operations.proto"; -import "google/protobuf/any.proto"; -import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; -import "google/protobuf/wrappers.proto"; -import "google/rpc/status.proto"; - -option cc_enable_arenas = true; -option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1;speech"; -option java_multiple_files = true; -option java_outer_classname = "SpeechProto"; -option java_package = "com.google.cloud.speech.v1p1beta1"; -option objc_class_prefix = "GCS"; - -// Service that implements Google Cloud Speech API. -service Speech { - option (google.api.default_host) = "speech.googleapis.com"; - option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform"; - - // Performs synchronous speech recognition: receive results after all audio - // has been sent and processed. - rpc Recognize(RecognizeRequest) returns (RecognizeResponse) { - option (google.api.http) = { - post: "/v1p1beta1/speech:recognize" - body: "*" - }; - option (google.api.method_signature) = "config,audio"; - } - - // Performs asynchronous speech recognition: receive results via the - // google.longrunning.Operations interface. Returns either an - // `Operation.error` or an `Operation.response` which contains - // a `LongRunningRecognizeResponse` message. - // For more information on asynchronous speech recognition, see the - // [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize). - rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) { - option (google.api.http) = { - post: "/v1p1beta1/speech:longrunningrecognize" - body: "*" - }; - option (google.api.method_signature) = "config,audio"; - option (google.longrunning.operation_info) = { - response_type: "LongRunningRecognizeResponse" - metadata_type: "LongRunningRecognizeMetadata" - }; - } - - // Performs bidirectional streaming speech recognition: receive results while - // sending audio. This method is only available via the gRPC API (not REST). - rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { - } -} - -// The top-level message sent by the client for the `Recognize` method. 
-message RecognizeRequest { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. The audio data to be recognized. - RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED]; -} - -// The top-level message sent by the client for the `LongRunningRecognize` -// method. -message LongRunningRecognizeRequest { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // Required. The audio data to be recognized. - RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED]; - - // Optional. Specifies an optional destination for the recognition results. - TranscriptOutputConfig output_config = 4 [(google.api.field_behavior) = OPTIONAL]; -} - -// Specifies an optional destination for the recognition results. -message TranscriptOutputConfig { - oneof output_type { - // Specifies a Cloud Storage URI for the recognition results. Must be - // specified in the format: `gs://bucket_name/object_name`, and the bucket - // must already exist. - string gcs_uri = 1; - } -} - -// The top-level message sent by the client for the `StreamingRecognize` method. -// Multiple `StreamingRecognizeRequest` messages are sent. The first message -// must contain a `streaming_config` message and must not contain -// `audio_content`. All subsequent messages must contain `audio_content` and -// must not contain a `streaming_config` message. -message StreamingRecognizeRequest { - // The streaming request, which is either a streaming config or audio content. - oneof streaming_request { - // Provides information to the recognizer that specifies how to process the - // request. The first `StreamingRecognizeRequest` message must contain a - // `streaming_config` message. - StreamingRecognitionConfig streaming_config = 1; - - // The audio data to be recognized. Sequential chunks of audio data are sent - // in sequential `StreamingRecognizeRequest` messages. The first - // `StreamingRecognizeRequest` message must not contain `audio_content` data - // and all subsequent `StreamingRecognizeRequest` messages must contain - // `audio_content` data. The audio bytes must be encoded as specified in - // `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a - // pure binary representation (not base64). See - // [content limits](https://cloud.google.com/speech-to-text/quotas#content). - bytes audio_content = 2; - } -} - -// Provides information to the recognizer that specifies how to process the -// request. -message StreamingRecognitionConfig { - // Required. Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED]; - - // If `false` or omitted, the recognizer will perform continuous - // recognition (continuing to wait for and process audio even if the user - // pauses speaking) until the client closes the input stream (gRPC API) or - // until the maximum time limit has been reached. May return multiple - // `StreamingRecognitionResult`s with the `is_final` flag set to `true`. - // - // If `true`, the recognizer will detect a single spoken utterance. When it - // detects that the user has paused or stopped speaking, it will return an - // `END_OF_SINGLE_UTTERANCE` event and cease recognition. 
It will return no
-  // more than one `StreamingRecognitionResult` with the `is_final` flag set to
-  // `true`.
-  //
-  // The `single_utterance` field can only be used with specified models,
-  // otherwise an error is thrown. The `model` field in [`RecognitionConfig`][]
-  // must be set to:
-  //
-  // * `command_and_search`
-  // * `phone_call` AND additional field `useEnhanced`=`true`
-  // * The `model` field is left undefined. In this case the API auto-selects
-  //   a model based on any other parameters that you set in
-  //   `RecognitionConfig`.
-  bool single_utterance = 2;
-
-  // If `true`, interim results (tentative hypotheses) may be
-  // returned as they become available (these interim results are indicated with
-  // the `is_final=false` flag).
-  // If `false` or omitted, only `is_final=true` result(s) are returned.
-  bool interim_results = 3;
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message RecognitionConfig {
-  // The encoding of the audio data sent in the request.
-  //
-  // All encodings support only 1 channel (mono) audio, unless the
-  // `audio_channel_count` and `enable_separate_recognition_per_channel` fields
-  // are set.
-  //
-  // For best results, the audio source should be captured and transmitted using
-  // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
-  // recognition can be reduced if lossy codecs are used to capture or transmit
-  // audio, particularly if background noise is present. Lossy codecs include
-  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
-  //
-  // The `FLAC` and `WAV` audio file formats include a header that describes the
-  // included audio content. You can request recognition for `WAV` files that
-  // contain either `LINEAR16` or `MULAW` encoded audio.
-  // If you send `FLAC` or `WAV` audio file format in
-  // your request, you do not need to specify an `AudioEncoding`; the audio
-  // encoding format is determined from the file header. If you specify
-  // an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
-  // encoding configuration must match the encoding described in the audio
-  // header; otherwise the request returns an
-  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
-  enum AudioEncoding {
-    // Not specified.
-    ENCODING_UNSPECIFIED = 0;
-
-    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
-    LINEAR16 = 1;
-
-    // `FLAC` (Free Lossless Audio
-    // Codec) is the recommended encoding because it is
-    // lossless--therefore recognition is not compromised--and
-    // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
-    // encoding supports 16-bit and 24-bit samples, however, not all fields in
-    // `STREAMINFO` are supported.
-    FLAC = 2;
-
-    // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
-    MULAW = 3;
-
-    // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
-    AMR = 4;
-
-    // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
-    AMR_WB = 5;
-
-    // Opus encoded audio frames in Ogg container
-    // ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
-    OGG_OPUS = 6;
-
-    // Although the use of lossy encodings is not recommended, if a very low
-    // bitrate encoding is required, `OGG_OPUS` is highly preferred over
-    // Speex encoding.
The [Speex](https://speex.org/) encoding supported by - // Cloud Speech API has a header byte in each block, as in MIME type - // `audio/x-speex-with-header-byte`. - // It is a variant of the RTP Speex encoding defined in - // [RFC 5574](https://tools.ietf.org/html/rfc5574). - // The stream is a sequence of blocks, one block per RTP packet. Each block - // starts with a byte containing the length of the block, in bytes, followed - // by one or more frames of Speex data, padded to an integral number of - // bytes (octets) as specified in RFC 5574. In other words, each RTP header - // is replaced with a single byte containing the block length. Only Speex - // wideband is supported. `sample_rate_hertz` must be 16000. - SPEEX_WITH_HEADER_BYTE = 7; - - // MP3 audio. MP3 encoding is a Beta feature and only available in - // v1p1beta1. Support all standard MP3 bitrates (which range from 32-320 - // kbps). When using this encoding, `sample_rate_hertz` has to match the - // sample rate of the file being used. - MP3 = 8; - } - - // Encoding of audio data sent in all `RecognitionAudio` messages. - // This field is optional for `FLAC` and `WAV` audio files and required - // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. - AudioEncoding encoding = 1; - - // Sample rate in Hertz of the audio data sent in all - // `RecognitionAudio` messages. Valid values are: 8000-48000. - // 16000 is optimal. For best results, set the sampling rate of the audio - // source to 16000 Hz. If that's not possible, use the native sample rate of - // the audio source (instead of re-sampling). - // This field is optional for FLAC and WAV audio files, but is - // required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding]. - int32 sample_rate_hertz = 2; - - // The number of channels in the input audio data. - // ONLY set this for MULTI-CHANNEL recognition. - // Valid values for LINEAR16 and FLAC are `1`-`8`. - // Valid values for OGG_OPUS are '1'-'254'. - // Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`. - // If `0` or omitted, defaults to one channel (mono). - // Note: We only recognize the first channel by default. - // To perform independent recognition on each channel set - // `enable_separate_recognition_per_channel` to 'true'. - int32 audio_channel_count = 7; - - // This needs to be set to `true` explicitly and `audio_channel_count` > 1 - // to get each channel recognized separately. The recognition result will - // contain a `channel_tag` field to state which channel that result belongs - // to. If this is not true, we will only recognize the first channel. The - // request is billed cumulatively for all channels recognized: - // `audio_channel_count` multiplied by the length of the audio. - bool enable_separate_recognition_per_channel = 12; - - // Required. The language of the supplied audio as a - // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. - // Example: "en-US". - // See [Language - // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list - // of the currently supported language codes. - string language_code = 3 [(google.api.field_behavior) = REQUIRED]; - - // A list of up to 3 additional - // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags, - // listing possible alternative languages of the supplied audio. 
- // See [Language - // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list - // of the currently supported language codes. If alternative languages are - // listed, recognition result will contain recognition in the most likely - // language detected including the main language_code. The recognition result - // will include the language tag of the language detected in the audio. Note: - // This feature is only supported for Voice Command and Voice Search use cases - // and performance may vary for other use cases (e.g., phone call - // transcription). - repeated string alternative_language_codes = 18; - - // Maximum number of recognition hypotheses to be returned. - // Specifically, the maximum number of `SpeechRecognitionAlternative` messages - // within each `SpeechRecognitionResult`. - // The server may return fewer than `max_alternatives`. - // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of - // one. If omitted, will return a maximum of one. - int32 max_alternatives = 4; - - // If set to `true`, the server will attempt to filter out - // profanities, replacing all but the initial character in each filtered word - // with asterisks, e.g. "f***". If set to `false` or omitted, profanities - // won't be filtered out. - bool profanity_filter = 5; - - // Speech adaptation configuration improves the accuracy of speech - // recognition. When speech adaptation is set it supersedes the - // `speech_contexts` field. For more information, see the [speech - // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation) - // documentation. - SpeechAdaptation adaptation = 20; - - // Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext]. - // A means to provide context to assist the speech recognition. For more - // information, see - // [speech - // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation). - repeated SpeechContext speech_contexts = 6; - - // If `true`, the top result includes a list of words and - // the start and end time offsets (timestamps) for those words. If - // `false`, no word-level time offset information is returned. The default is - // `false`. - bool enable_word_time_offsets = 8; - - // If `true`, the top result includes a list of words and the - // confidence for those words. If `false`, no word-level confidence - // information is returned. The default is `false`. - bool enable_word_confidence = 15; - - // If 'true', adds punctuation to recognition result hypotheses. - // This feature is only available in select languages. Setting this for - // requests in other languages has no effect at all. - // The default 'false' value does not add punctuation to result hypotheses. - bool enable_automatic_punctuation = 11; - - // The spoken punctuation behavior for the call - // If not set, uses default behavior based on model of choice - // e.g. command_and_search will enable spoken punctuation by default - // If 'true', replaces spoken punctuation with the corresponding symbols in - // the request. For example, "how are you question mark" becomes "how are - // you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation - // for support. If 'false', spoken punctuation is not replaced. - google.protobuf.BoolValue enable_spoken_punctuation = 22; - - // The spoken emoji behavior for the call - // If not set, uses default behavior based on model of choice - // If 'true', adds spoken emoji formatting for the request. 
This will replace
-  // spoken emojis with the corresponding Unicode symbols in the final
-  // transcript. If 'false', spoken emojis are not replaced.
-  google.protobuf.BoolValue enable_spoken_emojis = 23;
-
-  // If 'true', enables speaker detection for each recognized word in
-  // the top alternative of the recognition result using a speaker_tag provided
-  // in the WordInfo.
-  // Note: Use diarization_config instead.
-  bool enable_speaker_diarization = 16 [deprecated = true];
-
-  // If set, specifies the estimated number of speakers in the conversation.
-  // Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
-  // Note: Use diarization_config instead.
-  int32 diarization_speaker_count = 17 [deprecated = true];
-
-  // Config to enable speaker diarization and set additional
-  // parameters to make diarization better suited for your application.
-  // Note: When this is enabled, we send all the words from the beginning of the
-  // audio for the top alternative in every consecutive STREAMING responses.
-  // This is done in order to improve our speaker tags as our models learn to
-  // identify the speakers in the conversation over time.
-  // For non-streaming requests, the diarization results will be provided only
-  // in the top alternative of the FINAL SpeechRecognitionResult.
-  SpeakerDiarizationConfig diarization_config = 19;
-
-  // Metadata regarding this request.
-  RecognitionMetadata metadata = 9;
-
-  // Which model to select for the given request. Select the model
-  // best suited to your domain to get best results. If a model is not
-  // explicitly specified, then we auto-select a model based on the parameters
-  // in the RecognitionConfig.
-  //
-  //   <table>
-  //     <tr>
-  //       <td><b>Model</b></td>
-  //       <td><b>Description</b></td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>command_and_search</code></td>
-  //       <td>Best for short queries such as voice commands or voice search.</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>phone_call</code></td>
-  //       <td>Best for audio that originated from a phone call (typically
-  //       recorded at an 8khz sampling rate).</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>video</code></td>
-  //       <td>Best for audio that originated from video or includes multiple
-  //           speakers. Ideally the audio is recorded at a 16khz or greater
-  //           sampling rate. This is a premium model that costs more than the
-  //           standard rate.</td>
-  //     </tr>
-  //     <tr>
-  //       <td><code>default</code></td>
-  //       <td>Best for audio that is not one of the specific audio models.
-  //           For example, long-form audio. Ideally the audio is high-fidelity,
-  //           recorded at a 16khz or greater sampling rate.</td>
-  //     </tr>
-  //   </table>
-  string model = 13;
-
-  // Set to true to use an enhanced model for speech recognition.
-  // If `use_enhanced` is set to true and the `model` field is not set, then
-  // an appropriate enhanced model is chosen if an enhanced model exists for
-  // the audio.
-  //
-  // If `use_enhanced` is true and an enhanced version of the specified model
-  // does not exist, then the speech is recognized using the standard version
-  // of the specified model.
-  bool use_enhanced = 14;
-}
-
-// Config to enable speaker diarization.
-message SpeakerDiarizationConfig {
-  // If 'true', enables speaker detection for each recognized word in
-  // the top alternative of the recognition result using a speaker_tag provided
-  // in the WordInfo.
-  bool enable_speaker_diarization = 1;
-
-  // Minimum number of speakers in the conversation. This range gives you more
-  // flexibility by allowing the system to automatically determine the correct
-  // number of speakers. If not set, the default value is 2.
-  int32 min_speaker_count = 2;
-
-  // Maximum number of speakers in the conversation. This range gives you more
-  // flexibility by allowing the system to automatically determine the correct
-  // number of speakers. If not set, the default value is 6.
-  int32 max_speaker_count = 3;
-
-  // Output only. Unused.
-  int32 speaker_tag = 5 [
-    deprecated = true,
-    (google.api.field_behavior) = OUTPUT_ONLY
-  ];
-}
-
-// Description of audio data to be recognized.
-message RecognitionMetadata {
-  // Use case categories that the audio recognition request can be described
-  // by.
-  enum InteractionType {
-    // Use case is either unknown or is something other than one of the other
-    // values below.
-    INTERACTION_TYPE_UNSPECIFIED = 0;
-
-    // Multiple people in a conversation or discussion. For example in a
-    // meeting with two or more people actively participating. Typically
-    // all the primary people speaking would be in the same room (if not,
-    // see PHONE_CALL).
-    DISCUSSION = 1;
-
-    // One or more persons lecturing or presenting to others, mostly
-    // uninterrupted.
-    PRESENTATION = 2;
-
-    // A phone-call or video-conference in which two or more people, who are
-    // not in the same room, are actively participating.
-    PHONE_CALL = 3;
-
-    // A recorded message intended for another person to listen to.
-    VOICEMAIL = 4;
-
-    // Professionally produced audio (e.g. TV Show, Podcast).
-    PROFESSIONALLY_PRODUCED = 5;
-
-    // Transcribe spoken questions and queries into text.
-    VOICE_SEARCH = 6;
-
-    // Transcribe voice commands, such as for controlling a device.
-    VOICE_COMMAND = 7;
-
-    // Transcribe speech to text to create a written document, such as a
-    // text-message, email or report.
-    DICTATION = 8;
-  }
-
-  // Enumerates the types of capture settings describing an audio file.
-  enum MicrophoneDistance {
-    // Audio type is not known.
-    MICROPHONE_DISTANCE_UNSPECIFIED = 0;
-
-    // The audio was captured from a closely placed microphone. E.g. phone,
-    // dictaphone, or handheld microphone. Generally, if the speaker is within
-    // 1 meter of the microphone.
-    NEARFIELD = 1;
-
-    // The speaker is within 3 meters of the microphone.
-    MIDFIELD = 2;
-
-    // The speaker is more than 3 meters away from the microphone.
-    FARFIELD = 3;
-  }
-
-  // The original media the speech was recorded on.
-  enum OriginalMediaType {
-    // Unknown original media type.
-    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
-
-    // The speech data is an audio recording.
-    AUDIO = 1;
-
-    // The speech data was originally recorded on a video.
-
-// Description of audio data to be recognized.
-message RecognitionMetadata {
-  // Use case categories that the audio recognition request can be described
-  // by.
-  enum InteractionType {
-    // Use case is either unknown or is something other than one of the other
-    // values below.
-    INTERACTION_TYPE_UNSPECIFIED = 0;
-
-    // Multiple people in a conversation or discussion. For example, in a
-    // meeting with two or more people actively participating. Typically
-    // all the primary people speaking would be in the same room (if not,
-    // see PHONE_CALL).
-    DISCUSSION = 1;
-
-    // One or more persons lecturing or presenting to others, mostly
-    // uninterrupted.
-    PRESENTATION = 2;
-
-    // A phone call or video conference in which two or more people, who are
-    // not in the same room, are actively participating.
-    PHONE_CALL = 3;
-
-    // A recorded message intended for another person to listen to.
-    VOICEMAIL = 4;
-
-    // Professionally produced audio (e.g. a TV show or podcast).
-    PROFESSIONALLY_PRODUCED = 5;
-
-    // Transcribe spoken questions and queries into text.
-    VOICE_SEARCH = 6;
-
-    // Transcribe voice commands, such as for controlling a device.
-    VOICE_COMMAND = 7;
-
-    // Transcribe speech to text to create a written document, such as a
-    // text message, email, or report.
-    DICTATION = 8;
-  }
-
-  // Enumerates the types of capture settings describing an audio file.
-  enum MicrophoneDistance {
-    // Audio type is not known.
-    MICROPHONE_DISTANCE_UNSPECIFIED = 0;
-
-    // The audio was captured from a closely placed microphone, e.g. a phone,
-    // dictaphone, or handheld microphone. Generally, the speaker is within
-    // 1 meter of the microphone.
-    NEARFIELD = 1;
-
-    // The speaker is within 3 meters of the microphone.
-    MIDFIELD = 2;
-
-    // The speaker is more than 3 meters away from the microphone.
-    FARFIELD = 3;
-  }
-
-  // The original media the speech was recorded on.
-  enum OriginalMediaType {
-    // Unknown original media type.
-    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
-
-    // The speech data is an audio recording.
-    AUDIO = 1;
-
-    // The speech data was originally recorded on a video.
-    VIDEO = 2;
-  }
-
-  // The type of device the speech was recorded with.
-  enum RecordingDeviceType {
-    // The recording device is unknown.
-    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;
-
-    // Speech was recorded on a smartphone.
-    SMARTPHONE = 1;
-
-    // Speech was recorded using a personal computer or tablet.
-    PC = 2;
-
-    // Speech was recorded over a phone line.
-    PHONE_LINE = 3;
-
-    // Speech was recorded in a vehicle.
-    VEHICLE = 4;
-
-    // Speech was recorded outdoors.
-    OTHER_OUTDOOR_DEVICE = 5;
-
-    // Speech was recorded indoors.
-    OTHER_INDOOR_DEVICE = 6;
-  }
-
-  // The use case most closely describing the audio content to be recognized.
-  InteractionType interaction_type = 1;
-
-  // The industry vertical to which this speech recognition request most
-  // closely applies. This is most indicative of the topics contained
-  // in the audio. Use the 6-digit NAICS code to identify the industry
-  // vertical; see https://www.naics.com/search/.
-  uint32 industry_naics_code_of_audio = 3;
-
-  // The audio type that most closely describes the audio being recognized.
-  MicrophoneDistance microphone_distance = 4;
-
-  // The original media the speech was recorded on.
-  OriginalMediaType original_media_type = 5;
-
-  // The type of device the speech was recorded with.
-  RecordingDeviceType recording_device_type = 6;
-
-  // The device used to make the recording. Examples: 'Nexus 5X',
-  // 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
-  // 'Cardioid Microphone'.
-  string recording_device_name = 7;
-
-  // MIME type of the original audio file. For example `audio/m4a`,
-  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
-  // A list of possible audio MIME types is maintained at
-  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
-  string original_mime_type = 8;
-
-  // Obfuscated (privacy-protected) ID of the user, to identify the number of
-  // unique users using the service.
-  int64 obfuscated_id = 9 [deprecated = true];
-
-  // Description of the content, e.g. "Recordings of federal supreme court
-  // hearings from 2012".
-  string audio_topic = 10;
-}
-
-// Provides "hints" to the speech recognizer to favor specific words and phrases
-// in the results.
-message SpeechContext {
-  // A list of strings containing word and phrase "hints" so that
-  // the speech recognition is more likely to recognize them. This can be used
-  // to improve the accuracy for specific words and phrases, for example, if
-  // specific commands are typically spoken by the user. This can also be used
-  // to add additional words to the vocabulary of the recognizer. See
-  // [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
-  //
-  // List items can also be set to classes for groups of words that represent
-  // common concepts that occur in natural language. For example, rather than
-  // providing phrase hints for every month of the year, using the $MONTH class
-  // improves the likelihood of correctly transcribing audio that includes
-  // months.
-  repeated string phrases = 1;
-
-  // Hint Boost. A positive value will increase the probability that a specific
-  // phrase will be recognized over other similar sounding phrases. The higher
-  // the boost, the higher the chance of false positive recognition as well.
-  // Negative boost values would correspond to anti-biasing. Anti-biasing is not
-  // enabled, so a negative boost will simply be ignored. Though `boost` can
-  // accept a wide range of positive values, most use cases are best served with
-  // values between 0 and 20. We recommend using a binary search approach to
-  // finding the optimal value for your use case.
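-  //
-  // A sketch of how these hints might be supplied (same hypothetical Python
-  // client as above; the phrase list and boost value are invented):
-  //
-  //   context = speech.SpeechContext(
-  //       phrases=["weather", "whether", "$MONTH"],
-  //       boost=10.0,  # illustrative value; tune per use case
-  //   )
-  //   config = speech.RecognitionConfig(
-  //       language_code="en-US",
-  //       speech_contexts=[context],
-  //   )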
-  float boost = 4;
-}
-
-// Contains audio data in the encoding specified in the `RecognitionConfig`.
-// Either `content` or `uri` must be supplied. Supplying both or neither
-// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
-// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
-message RecognitionAudio {
-  // The audio source, which is either inline content or a Google Cloud
-  // Storage URI.
-  oneof audio_source {
-    // The audio data bytes encoded as specified in
-    // `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
-    // pure binary representation, whereas JSON representations use base64.
-    bytes content = 1;
-
-    // URI that points to a file that contains audio data bytes as specified in
-    // `RecognitionConfig`. The file must not be compressed (for example, gzip).
-    // Currently, only Google Cloud Storage URIs are
-    // supported, which must be specified in the following format:
-    // `gs://bucket_name/object_name` (other URI formats return
-    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
-    // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
-    string uri = 2;
-  }
-}
-
-// The only message returned to the client by the `Recognize` method. It
-// contains the result as zero or more sequential `SpeechRecognitionResult`
-// messages.
-message RecognizeResponse {
-  // Sequential list of transcription results corresponding to
-  // sequential portions of audio.
-  repeated SpeechRecognitionResult results = 2;
-}
-
-// The only message returned to the client by the `LongRunningRecognize` method.
-// It contains the result as zero or more sequential `SpeechRecognitionResult`
-// messages. It is included in the `result.response` field of the `Operation`
-// returned by the `GetOperation` call of the `google::longrunning::Operations`
-// service.
-message LongRunningRecognizeResponse {
-  // Sequential list of transcription results corresponding to
-  // sequential portions of audio.
-  repeated SpeechRecognitionResult results = 2;
-
-  // Original output config if present in the request.
-  TranscriptOutputConfig output_config = 6;
-
-  // If the transcript output fails, this field contains the relevant error.
-  google.rpc.Status output_error = 7;
-}
-
-// Describes the progress of a long-running `LongRunningRecognize` call. It is
-// included in the `metadata` field of the `Operation` returned by the
-// `GetOperation` call of the `google::longrunning::Operations` service.
-message LongRunningRecognizeMetadata {
-  // Approximate percentage of audio processed thus far. Guaranteed to be 100
-  // when the audio is fully processed and the results are available.
-  int32 progress_percent = 1;
-
-  // Time when the request was received.
-  google.protobuf.Timestamp start_time = 2;
-
-  // Time of the most recent processing update.
-  google.protobuf.Timestamp last_update_time = 3;
-
-  // Output only. The URI of the audio file being transcribed. Empty if the audio was sent
-  // as byte content.
-  string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
-
-  // Output only. A copy of the TranscriptOutputConfig if it was set in the request.
-  TranscriptOutputConfig output_config = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
-}
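-
-// A sketch of the asynchronous flow (hypothetical Python client usage, with
-// `client` from the first sketch; the bucket path is made up).
-// `long_running_recognize` returns an Operation whose result is a
-// `LongRunningRecognizeResponse`:
-//
-//   config = speech.RecognitionConfig(
-//       language_code="en-US",
-//       enable_word_time_offsets=True,  # populates WordInfo start/end times
-//   )
-//   audio = speech.RecognitionAudio(uri="gs://my-bucket/long-audio.flac")
-//   operation = client.long_running_recognize(config=config, audio=audio)
-//   response = operation.result(timeout=300)  # blocks until the LRO is done
-//   for result in response.results:
-//       best = result.alternatives[0]
-//       print(best.transcript, best.confidence)
-//       for word in best.words:
-//           print(word.word, word.start_time, word.end_time)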
-
-// `StreamingRecognizeResponse` is the only message returned to the client by
-// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse`
-// messages are streamed back to the client. If there is no recognizable
-// audio, and `single_utterance` is set to false, then no messages are streamed
-// back to the client.
-//
-// Here's an example of a series of `StreamingRecognizeResponse`s that might be
-// returned while processing audio:
-//
-// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
-//
-// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
-//
-// 3. results { alternatives { transcript: "to be" } stability: 0.9 }
-//    results { alternatives { transcript: " or not to be" } stability: 0.01 }
-//
-// 4. results { alternatives { transcript: "to be or not to be"
-//                             confidence: 0.92 }
-//              alternatives { transcript: "to bee or not to bee" }
-//              is_final: true }
-//
-// 5. results { alternatives { transcript: " that's" } stability: 0.01 }
-//
-// 6. results { alternatives { transcript: " that is" } stability: 0.9 }
-//    results { alternatives { transcript: " the question" } stability: 0.01 }
-//
-// 7. results { alternatives { transcript: " that is the question"
-//                             confidence: 0.98 }
-//              alternatives { transcript: " that was the question" }
-//              is_final: true }
-//
-// Notes:
-//
-// - Only two of the above responses, #4 and #7, contain final results; they are
-//   indicated by `is_final: true`. Concatenating these together generates the
-//   full transcript: "to be or not to be that is the question".
-//
-// - The others contain interim `results`. #3 and #6 contain two interim
-//   `results`: the first portion has a high stability and is less likely to
-//   change; the second portion has a low stability and is very likely to
-//   change. A UI designer might choose to show only high stability `results`.
-//
-// - The specific `stability` and `confidence` values shown above are only for
-//   illustrative purposes. Actual values may vary.
-//
-// - In each response, only one of these fields will be set:
-//   `error`, `speech_event_type`, or one or more (repeated) `results`.
-message StreamingRecognizeResponse {
-  // Indicates the type of speech event.
-  enum SpeechEventType {
-    // No speech event specified.
-    SPEECH_EVENT_UNSPECIFIED = 0;
-
-    // This event indicates that the server has detected the end of the user's
-    // speech utterance and expects no additional speech. Therefore, the server
-    // will not process additional audio (although it may subsequently return
-    // additional results). The client should stop sending additional audio
-    // data, half-close the gRPC connection, and wait for any additional results
-    // until the server closes the gRPC connection. This event is only sent if
-    // `single_utterance` was set to `true`, and is not used otherwise.
-    END_OF_SINGLE_UTTERANCE = 1;
-  }
-
-  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
-  // specifies the error for the operation.
-  google.rpc.Status error = 1;
-
-  // This repeated list contains zero or more results that
-  // correspond to consecutive portions of the audio currently being processed.
-  // It contains zero or one `is_final=true` result (the newly settled portion),
-  // followed by zero or more `is_final=false` results (the interim results).
-  repeated StreamingRecognitionResult results = 2;
-
-  // Indicates the type of speech event.
-  SpeechEventType speech_event_type = 4;
-}
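-
-// A sketch of the bidirectional flow (hypothetical Python client usage, with
-// `client` from the first sketch; `chunks` is a made-up iterable of audio
-// byte strings). The first request carries the streaming config and all
-// subsequent requests carry audio:
-//
-//   streaming_config = speech.StreamingRecognitionConfig(
-//       config=speech.RecognitionConfig(
-//           encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
-//           sample_rate_hertz=16000,
-//           language_code="en-US",
-//       ),
-//       interim_results=True,
-//   )
-//
-//   def requests(chunks):
-//       for chunk in chunks:  # e.g. read from a microphone or a file
-//           yield speech.StreamingRecognizeRequest(audio_content=chunk)
-//
-//   for response in client.streaming_recognize(streaming_config, requests(chunks)):
-//       for result in response.results:
-//           print(result.is_final, result.alternatives[0].transcript)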
-
-// A streaming speech recognition result corresponding to a portion of the audio
-// that is currently being processed.
-message StreamingRecognitionResult {
-  // May contain one or more recognition hypotheses (up to the
-  // maximum specified in `max_alternatives`).
-  // These alternatives are ordered in terms of accuracy, with the top (first)
-  // alternative being the most probable, as ranked by the recognizer.
-  repeated SpeechRecognitionAlternative alternatives = 1;
-
-  // If `false`, this `StreamingRecognitionResult` represents an
-  // interim result that may change. If `true`, this is the final time the
-  // speech service will return this particular `StreamingRecognitionResult`;
-  // the recognizer will not return any further hypotheses for this portion of
-  // the transcript and corresponding audio.
-  bool is_final = 2;
-
-  // An estimate of the likelihood that the recognizer will not
-  // change its guess about this interim result. Values range from 0.0
-  // (completely unstable) to 1.0 (completely stable).
-  // This field is only provided for interim results (`is_final=false`).
-  // The default of 0.0 is a sentinel value indicating `stability` was not set.
-  float stability = 3;
-
-  // Time offset of the end of this result relative to the
-  // beginning of the audio.
-  google.protobuf.Duration result_end_time = 4;
-
-  // For multi-channel audio, this is the channel number corresponding to the
-  // recognized result for the audio from that channel.
-  // For audio_channel_count = N, its output values can range from '1' to 'N'.
-  int32 channel_tag = 5;
-
-  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
-  // of the language in this result. This language code was detected to have
-  // the most likelihood of being spoken in the audio.
-  string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
-}
-
-// A speech recognition result corresponding to a portion of the audio.
-message SpeechRecognitionResult {
-  // May contain one or more recognition hypotheses (up to the
-  // maximum specified in `max_alternatives`).
-  // These alternatives are ordered in terms of accuracy, with the top (first)
-  // alternative being the most probable, as ranked by the recognizer.
-  repeated SpeechRecognitionAlternative alternatives = 1;
-
-  // For multi-channel audio, this is the channel number corresponding to the
-  // recognized result for the audio from that channel.
-  // For audio_channel_count = N, its output values can range from '1' to 'N'.
-  int32 channel_tag = 2;
-
-  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
-  // of the language in this result. This language code was detected to have
-  // the most likelihood of being spoken in the audio.
-  string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
-}
-
-// Alternative hypotheses (a.k.a. n-best list).
-message SpeechRecognitionAlternative {
-  // Transcript text representing the words that the user spoke.
-  string transcript = 1;
-
-  // The confidence estimate between 0.0 and 1.0. A higher number
-  // indicates an estimated greater likelihood that the recognized words are
-  // correct. This field is set only for the top alternative of a non-streaming
-  // result or of a streaming result where `is_final=true`.
-  // This field is not guaranteed to be accurate and users should not rely on it
-  // to be always provided.
-  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
-  float confidence = 2;
-
-  // A list of word-specific information for each recognized word.
- // Note: When `enable_speaker_diarization` is true, you will see all the words - // from the beginning of the audio. - repeated WordInfo words = 3; -} - -// Word-specific information for recognized words. -message WordInfo { - // Time offset relative to the beginning of the audio, - // and corresponding to the start of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - google.protobuf.Duration start_time = 1; - - // Time offset relative to the beginning of the audio, - // and corresponding to the end of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - google.protobuf.Duration end_time = 2; - - // The word corresponding to this set of information. - string word = 3; - - // The confidence estimate between 0.0 and 1.0. A higher number - // indicates an estimated greater likelihood that the recognized words are - // correct. This field is set only for the top alternative of a non-streaming - // result or, of a streaming result where `is_final=true`. - // This field is not guaranteed to be accurate and users should not rely on it - // to be always provided. - // The default of 0.0 is a sentinel value indicating `confidence` was not set. - float confidence = 4; - - // Output only. A distinct integer value is assigned for every speaker within - // the audio. This field specifies which one of those speakers was detected to - // have spoken this word. Value ranges from '1' to diarization_speaker_count. - // speaker_tag is set if enable_speaker_diarization = 'true' and only in the - // top alternative. - int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY]; -} diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech_adaptation.proto b/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech_adaptation.proto deleted file mode 100644 index 16789739d6e2..000000000000 --- a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/cloud_speech_adaptation.proto +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2021 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -syntax = "proto3"; - -package google.cloud.speech.v1p1beta1; - -import "google/api/annotations.proto"; -import "google/api/client.proto"; -import "google/api/field_behavior.proto"; -import "google/api/resource.proto"; -import "google/cloud/speech/v1p1beta1/resource.proto"; -import "google/protobuf/empty.proto"; -import "google/protobuf/field_mask.proto"; - -option cc_enable_arenas = true; -option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1;speech"; -option java_multiple_files = true; -option java_outer_classname = "SpeechAdaptationProto"; -option java_package = "com.google.cloud.speech.v1p1beta1"; -option objc_class_prefix = "GCS"; - -// Service that implements Google Cloud Speech Adaptation API. -service Adaptation { - option (google.api.default_host) = "speech.googleapis.com"; - option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform"; - - // Create a set of phrase hints. Each item in the set can be a single word or - // a multi-word phrase. The items in the PhraseSet are favored by the - // recognition model when you send a call that includes the PhraseSet. - rpc CreatePhraseSet(CreatePhraseSetRequest) returns (PhraseSet) { - option (google.api.http) = { - post: "/v1p1beta1/{parent=projects/*/locations/*}/phraseSets" - body: "*" - }; - option (google.api.method_signature) = "parent,phrase_set,phrase_set_id"; - } - - // Get a phrase set. - rpc GetPhraseSet(GetPhraseSetRequest) returns (PhraseSet) { - option (google.api.http) = { - get: "/v1p1beta1/{name=projects/*/locations/*/phraseSets/*}" - }; - option (google.api.method_signature) = "name"; - } - - // List phrase sets. - rpc ListPhraseSet(ListPhraseSetRequest) returns (ListPhraseSetResponse) { - option (google.api.http) = { - get: "/v1p1beta1/{parent=projects/*/locations/*}/phraseSets" - }; - option (google.api.method_signature) = "parent"; - } - - // Update a phrase set. - rpc UpdatePhraseSet(UpdatePhraseSetRequest) returns (PhraseSet) { - option (google.api.http) = { - patch: "/v1p1beta1/{phrase_set.name=projects/*/locations/*/phraseSets/*}" - body: "phrase_set" - }; - option (google.api.method_signature) = "phrase_set,update_mask"; - } - - // Delete a phrase set. - rpc DeletePhraseSet(DeletePhraseSetRequest) returns (google.protobuf.Empty) { - option (google.api.http) = { - delete: "/v1p1beta1/{name=projects/*/locations/*/phraseSets/*}" - }; - option (google.api.method_signature) = "name"; - } - - // Create a custom class. - rpc CreateCustomClass(CreateCustomClassRequest) returns (CustomClass) { - option (google.api.http) = { - post: "/v1p1beta1/{parent=projects/*/locations/*}/customClasses" - body: "*" - }; - option (google.api.method_signature) = "parent,custom_class,custom_class_id"; - } - - // Get a custom class. - rpc GetCustomClass(GetCustomClassRequest) returns (CustomClass) { - option (google.api.http) = { - get: "/v1p1beta1/{name=projects/*/locations/*/customClasses/*}" - }; - option (google.api.method_signature) = "name"; - } - - // List custom classes. - rpc ListCustomClasses(ListCustomClassesRequest) returns (ListCustomClassesResponse) { - option (google.api.http) = { - get: "/v1p1beta1/{parent=projects/*/locations/*}/customClasses" - }; - option (google.api.method_signature) = "parent"; - } - - // Update a custom class. 
-  rpc UpdateCustomClass(UpdateCustomClassRequest) returns (CustomClass) {
-    option (google.api.http) = {
-      patch: "/v1p1beta1/{custom_class.name=projects/*/locations/*/customClasses/*}"
-      body: "custom_class"
-    };
-    option (google.api.method_signature) = "custom_class,update_mask";
-  }
-
-  // Delete a custom class.
-  rpc DeleteCustomClass(DeleteCustomClassRequest) returns (google.protobuf.Empty) {
-    option (google.api.http) = {
-      delete: "/v1p1beta1/{name=projects/*/locations/*/customClasses/*}"
-    };
-    option (google.api.method_signature) = "name";
-  }
-}
-
-// Message sent by the client for the `CreatePhraseSet` method.
-message CreatePhraseSetRequest {
-  // Required. The parent resource where this phrase set will be created.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/phraseSets
-  string parent = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      child_type: "speech.googleapis.com/PhraseSet"
-    }
-  ];
-
-  // The ID to use for the phrase set, which will become the final
-  // component of the phrase set's resource name.
-  //
-  // This value should be 4-63 characters, and valid characters
-  // are /[a-z][0-9]-/.
-  string phrase_set_id = 2;
-
-  // Required. The phrase set to create.
-  PhraseSet phrase_set = 3 [(google.api.field_behavior) = REQUIRED];
-}
-
-// Message sent by the client for the `UpdatePhraseSet` method.
-message UpdatePhraseSetRequest {
-  // Required. The phrase set to update.
-  //
-  // The phrase set's `name` field is used to identify the set to be
-  // updated. Format:
-  // {api_version}/projects/{project}/locations/{location}/phraseSets/{phrase_set}
-  PhraseSet phrase_set = 1 [(google.api.field_behavior) = REQUIRED];
-
-  // The list of fields to be updated.
-  google.protobuf.FieldMask update_mask = 2;
-}
-
-// Message sent by the client for the `GetPhraseSet` method.
-message GetPhraseSetRequest {
-  // Required. The name of the phrase set to retrieve.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/phraseSets/{phrase_set}
-  string name = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      type: "speech.googleapis.com/PhraseSet"
-    }
-  ];
-}
-
-// Message sent by the client for the `ListPhraseSet` method.
-message ListPhraseSetRequest {
-  // Required. The parent, which owns this collection of phrase sets.
-  // Format:
-  // projects/{project}/locations/{location}
-  string parent = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      child_type: "speech.googleapis.com/PhraseSet"
-    }
-  ];
-
-  // The maximum number of phrase sets to return. The service may return
-  // fewer than this value. If unspecified, at most 50 phrase sets will be
-  // returned. The maximum value is 1000; values above 1000 will be coerced to
-  // 1000.
-  int32 page_size = 2;
-
-  // A page token, received from a previous `ListPhraseSet` call.
-  // Provide this to retrieve the subsequent page.
-  //
-  // When paginating, all other parameters provided to `ListPhraseSet` must
-  // match the call that provided the page token.
-  string page_token = 3;
-}
-
-// Message returned to the client by the `ListPhraseSet` method.
-message ListPhraseSetResponse {
-  // The phrase sets.
-  repeated PhraseSet phrase_sets = 1;
-
-  // A token, which can be sent as `page_token` to retrieve the next page.
-  // If this field is omitted, there are no subsequent pages.
-  string next_page_token = 2;
-}
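-
-// A sketch of creating a phrase set through the generated Python client
-// (assuming `AdaptationClient` from `google.cloud.speech_v1p1beta1`; the
-// project, location, and IDs are placeholders):
-//
-//   from google.cloud import speech_v1p1beta1 as speech
-//
-//   adaptation = speech.AdaptationClient()
-//   phrase_set = adaptation.create_phrase_set(
-//       parent="projects/my-project/locations/global",  # hypothetical project
-//       phrase_set_id="my-phrase-set",
-//       phrase_set=speech.PhraseSet(
-//           phrases=[speech.PhraseSet.Phrase(value="fare", boost=10.0)],
-//       ),
-//   )
-//   print(phrase_set.name)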
-
-// Message sent by the client for the `DeletePhraseSet` method.
-message DeletePhraseSetRequest {
-  // Required. The name of the phrase set to delete.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/phraseSets/{phrase_set}
-  string name = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      type: "speech.googleapis.com/PhraseSet"
-    }
-  ];
-}
-
-// Message sent by the client for the `CreateCustomClass` method.
-message CreateCustomClassRequest {
-  // Required. The parent resource where this custom class will be created.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/customClasses
-  string parent = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      child_type: "speech.googleapis.com/CustomClass"
-    }
-  ];
-
-  // The ID to use for the custom class, which will become the final
-  // component of the custom class's resource name.
-  //
-  // This value should be 4-63 characters, and valid characters
-  // are /[a-z][0-9]-/.
-  string custom_class_id = 2;
-
-  // Required. The custom class to create.
-  CustomClass custom_class = 3 [(google.api.field_behavior) = REQUIRED];
-}
-
-// Message sent by the client for the `UpdateCustomClass` method.
-message UpdateCustomClassRequest {
-  // Required. The custom class to update.
-  //
-  // The custom class's `name` field is used to identify the custom class to be
-  // updated. Format:
-  // {api_version}/projects/{project}/locations/{location}/customClasses/{custom_class}
-  CustomClass custom_class = 1 [(google.api.field_behavior) = REQUIRED];
-
-  // The list of fields to be updated.
-  google.protobuf.FieldMask update_mask = 2;
-}
-
-// Message sent by the client for the `GetCustomClass` method.
-message GetCustomClassRequest {
-  // Required. The name of the custom class to retrieve.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/customClasses/{custom_class}
-  string name = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      type: "speech.googleapis.com/CustomClass"
-    }
-  ];
-}
-
-// Message sent by the client for the `ListCustomClasses` method.
-message ListCustomClassesRequest {
-  // Required. The parent, which owns this collection of custom classes.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/customClasses
-  string parent = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      child_type: "speech.googleapis.com/CustomClass"
-    }
-  ];
-
-  // The maximum number of custom classes to return. The service may return
-  // fewer than this value. If unspecified, at most 50 custom classes will be
-  // returned. The maximum value is 1000; values above 1000 will be coerced to
-  // 1000.
-  int32 page_size = 2;
-
-  // A page token, received from a previous `ListCustomClasses` call.
-  // Provide this to retrieve the subsequent page.
-  //
-  // When paginating, all other parameters provided to `ListCustomClasses` must
-  // match the call that provided the page token.
-  string page_token = 3;
-}
-
-// Message returned to the client by the `ListCustomClasses` method.
-message ListCustomClassesResponse {
-  // The custom classes.
-  repeated CustomClass custom_classes = 1;
-
-  // A token, which can be sent as `page_token` to retrieve the next page.
-  // If this field is omitted, there are no subsequent pages.
-  string next_page_token = 2;
-}
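-
-// Both list methods paginate. A sketch with the same hypothetical client as
-// above; the generated pager is assumed to handle `page_token` internally:
-//
-//   for custom_class in adaptation.list_custom_classes(
-//       parent="projects/my-project/locations/global",  # hypothetical project
-//       page_size=50,
-//   ):
-//       print(custom_class.name)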
-
-// Message sent by the client for the `DeleteCustomClass` method.
-message DeleteCustomClassRequest {
-  // Required. The name of the custom class to delete.
-  // Format:
-  // {api_version}/projects/{project}/locations/{location}/customClasses/{custom_class}
-  string name = 1 [
-    (google.api.field_behavior) = REQUIRED,
-    (google.api.resource_reference) = {
-      type: "speech.googleapis.com/CustomClass"
-    }
-  ];
-}
diff --git a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/resource.proto b/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/resource.proto
deleted file mode 100644
index 74cee0647d3f..000000000000
--- a/packages/google-cloud-python-speech/google/cloud/speech_v1p1beta1/proto/resource.proto
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright 2021 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto3";
-
-package google.cloud.speech.v1p1beta1;
-
-import "google/api/resource.proto";
-import "google/api/annotations.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta1;speech";
-option java_multiple_files = true;
-option java_outer_classname = "SpeechResourceProto";
-option java_package = "com.google.cloud.speech.v1p1beta1";
-option objc_class_prefix = "GCS";
-
-// A set of words or phrases that represents a common concept likely to appear
-// in your audio, for example a list of passenger ship names. CustomClass items
-// can be substituted into placeholders that you set in PhraseSet phrases.
-message CustomClass {
-  option (google.api.resource) = {
-    type: "speech.googleapis.com/CustomClass"
-    pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}"
-  };
-
-  // An item of the class.
-  message ClassItem {
-    // The class item's value.
-    string value = 1;
-  }
-
-  // The resource name of the custom class.
-  string name = 1;
-
-  // If this custom class is a resource, the custom_class_id is the resource id
-  // of the CustomClass. Case sensitive.
-  string custom_class_id = 2;
-
-  // A collection of class items.
-  repeated ClassItem items = 3;
-}
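-
-// A sketch of defining a custom class inline (same hypothetical Python client
-// namespace as above; the ID and item values are invented):
-//
-//   ships = speech.CustomClass(
-//       custom_class_id="my-ships",  # referenced from phrase hints as ${my-ships}
-//       items=[
-//           speech.CustomClass.ClassItem(value="Titanic"),
-//           speech.CustomClass.ClassItem(value="Queen Mary"),
-//       ],
-//   )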
"i was born in january", "i was born in febuary", ...), use the - // pre-built `$MONTH` class improves the likelihood of correctly transcribing - // audio that includes months (e.g. "i was born in $month"). - // To refer to pre-built classes, use the class' symbol prepended with `$` - // e.g. `$MONTH`. To refer to custom classes that were defined inline in the - // request, set the class's `custom_class_id` to a string unique to all class - // resources and inline classes. Then use the class' id wrapped in $`{...}` - // e.g. "${my-months}". To refer to custom classes resources, use the class' - // id wrapped in `${}` (e.g. `${my-months}`). - message Phrase { - // The phrase itself. - string value = 1; - - // Hint Boost. Overrides the boost set at the phrase set level. - // Positive value will increase the probability that a specific phrase will - // be recognized over other similar sounding phrases. The higher the boost, - // the higher the chance of false positive recognition as well. Negative - // boost values would correspond to anti-biasing. Anti-biasing is not - // enabled, so negative boost will simply be ignored. Though `boost` can - // accept a wide range of positive values, most use cases are best served - // with values between 0 and 20. We recommend using a binary search approach - // to finding the optimal value for your use case. Speech recognition - // will skip PhraseSets with a boost value of 0. - float boost = 2; - } - - // The resource name of the phrase set. - string name = 1; - - // A list of word and phrases. - repeated Phrase phrases = 2; - - // Hint Boost. Positive value will increase the probability that a specific - // phrase will be recognized over other similar sounding phrases. The higher - // the boost, the higher the chance of false positive recognition as well. - // Negative boost values would correspond to anti-biasing. Anti-biasing is not - // enabled, so negative boost will simply be ignored. Though `boost` can - // accept a wide range of positive values, most use cases are best served with - // values between 0 (exclusive) and 20. We recommend using a binary search - // approach to finding the optimal value for your use case. Speech recognition - // will skip PhraseSets with a boost value of 0. - float boost = 4; -} - -// Speech adaptation configuration. -message SpeechAdaptation { - // A collection of phrase sets. To specify the hints inline, leave the - // phrase set's `name` blank and fill in the rest of its fields. Any - // phrase set can use any custom class. - repeated PhraseSet phrase_sets = 1; - - // A collection of phrase set resource names to use. - repeated string phrase_set_references = 2; - - // A collection of custom classes. To specify the classes inline, leave the - // class' `name` blank and fill in the rest of its fields, giving it a unique - // `custom_class_id`. Refer to the inline defined class in phrase hints by its - // `custom_class_id`. - repeated CustomClass custom_classes = 3; -}