| // Copyright (C) 2017 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Text classification model configuration. |
| |
| syntax = "proto2"; |
| |
| package libtextclassifier; |
| |
| import "external/libtextclassifier/common/embedding-network.proto"; |
| import "external/libtextclassifier/smartselect/tokenizer.proto"; |
| |
| // Generate code against the lite protobuf runtime. |
| option optimize_for = LITE_RUNTIME; |
| |
| // Options shared by every model; nothing here is specific to the selection |
| // model or to the sharing model. |
| message ModelOptions { |
|   // When true, embeddings are taken from a different model. The main use is |
|   // the Sharing model reusing the embeddings of the Selection model. |
|   optional bool use_shared_embeddings = 1; |
| |
|   // The model's language. |
|   optional string language = 2; |
| |
|   // The model's version. |
|   optional int32 version = 3; |
| } |
| |
| // Options that apply to the selection model. |
| message SelectionModelOptions { |
|   // Field number 2 must not be assigned to a new field. |
|   reserved 2; |
| |
|   // Unicode codepoints that are stripped from predicted selections. |
|   // NOTE(review): the field name says "deprecated" but the field carries no |
|   // [deprecated = true] option; confirm whether any reader still uses it. |
|   repeated int32 deprecated_punctuation_to_strip = 1; |
| |
|   // When true, selections are forced to be symmetrical. |
|   optional bool enforce_symmetry = 3; |
| |
|   // How many inferences are made around the click position (counted to one |
|   // side) when symmetry is enforced. |
|   optional int32 symmetry_context_size = 4; |
| } |
| |
| // Options that apply to the sharing model. |
| message SharingModelOptions { |
|   // One regular expression matcher together with the collection it maps to. |
|   message RegexPattern { |
|     // Name of the collection reported for a match. |
|     optional string collection_name = 1; |
| |
|     // The pattern that is checked. |
|     optional string pattern = 2; |
|   } |
| |
|   // When true, "url" is always returned if the url hint is passed in. |
|   optional bool always_accept_url_hint = 1; |
| |
|   // When true, "email" is always returned if the e-mail hint is passed in. |
|   optional bool always_accept_email_hint = 2; |
| |
|   // Limits for phone numbers: minimum and maximum number of digits |
|   // (defaults 7 and 15). |
|   optional int32 phone_min_num_digits = 3 [default = 7]; |
|   optional int32 phone_max_num_digits = 4 [default = 15]; |
| |
|   // The regular expression matchers to check. |
|   repeated RegexPattern regex_pattern = 5; |
| } |
| |
| // Settings that control tokenization and feature extraction. |
| // |
| // Next ID: 39 |
| // NOTE(review): field numbers 17 and 37 are neither assigned nor reserved in |
| // this message. If they once belonged to removed fields they should be added |
| // to the reserved list; confirm before handing them out. |
| message FeatureProcessorOptions { |
|   // Field numbers that must not be assigned to new fields. |
|   reserved 7, 11, 12, 26 to 29, 32, 35; |
| |
|   // Number of buckets used when hashing charactergrams. |
|   optional int32 num_buckets = 1 [default = -1]; |
| |
|   // Number of words taken on each side of the selected word as its context: |
|   // with a context size of N, the N words to its left and the N words to its |
|   // right are used. |
|   optional int32 context_size = 2 [default = -1]; |
| |
|   // Maximum number of words of the context to select in total. |
|   optional int32 max_selection_span = 3 [default = -1]; |
| |
|   // Charactergram orders to extract, e.g. 2 for character bigrams, 3 for |
|   // character trigrams, and so on. |
|   repeated int32 chargram_orders = 4; |
| |
|   // Maximum word length, measured in codepoints. |
|   optional int32 max_word_length = 21 [default = 20]; |
| |
|   // When true, features are extracted with the unicode-aware functionality. |
|   optional bool unicode_aware_features = 19 [default = false]; |
| |
|   // Enables extraction of the token case feature. |
|   optional bool extract_case_feature = 5 [default = false]; |
| |
|   // Enables extraction of the selection mask feature. |
|   optional bool extract_selection_mask_feature = 6 [default = false]; |
| |
|   // Regexps that are run over each token. Every regexp contributes one dense |
|   // feature: 1.0 when it matches, -1.0 otherwise. |
|   repeated string regexp_feature = 22; |
| |
|   // When true, all digits are remapped to a single number. |
|   optional bool remap_digits = 20 [default = false]; |
| |
|   // When true, each token is lower-cased before hashgrams are generated. |
|   // No explicit default is declared, so an unset field reads as false. |
|   optional bool lowercase_tokens = 33; |
| |
|   // When true, the selection classifier output contains only the feasible |
|   // selections (e.g., those shorter than max_selection_span). When false, the |
|   // output is the complete cross-product of possible selections to the left |
|   // and possible selections to the right, infeasible ones included. |
|   // NOTE: Exists mainly for compatibility with older models that were trained |
|   // with the non-reduced output space. |
|   optional bool selection_reduced_output_space = 8 [default = true]; |
| |
|   // Names of the collections. |
|   repeated string collections = 9; |
| |
|   // Index into `collections` that is used when a collection name cannot be |
|   // mapped to an id. |
|   optional int32 default_collection = 10 [default = -1]; |
| |
|   // When true, the input is split by lines and only the line containing the |
|   // clicked token is used. |
|   optional bool only_use_line_with_click = 13 [default = false]; |
| |
|   // When true, a token that contains the selection boundary is split at the |
|   // position of that boundary. |
|   // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com" |
|   optional bool split_tokens_on_selection_boundaries = 14 [default = false]; |
| |
|   // Codepoint ranges that determine how different codepoints are tokenized. |
|   // The ranges must not overlap. |
|   repeated TokenizationCodepointRange tokenization_codepoint_config = 15; |
| |
|   // How the center token is chosen. |
|   enum CenterTokenSelectionMethod { |
|     DEFAULT_CENTER_TOKEN_METHOD = 0;  // Invalid option. |
| |
|     // The click indices determine the center token. |
|     CENTER_TOKEN_FROM_CLICK = 1; |
| |
|     // The selection indices give a token range, and the middle of that range |
|     // becomes the center token. |
|     CENTER_TOKEN_MIDDLE_OF_SELECTION = 2; |
|   } |
|   // No default is declared; in proto2 an unset enum field reads as the first |
|   // listed value, i.e. DEFAULT_CENTER_TOKEN_METHOD. |
|   optional CenterTokenSelectionMethod center_token_selection_method = 16; |
| |
|   // When true, span boundaries are snapped to the containing tokens instead |
|   // of being required to match token boundaries exactly. |
|   optional bool snap_label_span_boundaries_to_containing_tokens = 18; |
| |
|   // A codepoint range [start, end): `end` is exclusive. |
|   message CodepointRange { |
|     optional int32 start = 1; |
|     optional int32 end = 2; |
|   } |
| |
|   // The codepoint ranges supported by the model. |
|   repeated CodepointRange supported_codepoint_ranges = 23; |
| |
|   // Codepoint ranges used in the mixed tokenization mode to identify |
|   // stretches of tokens that are re-tokenized with the internal tokenizer. |
|   repeated CodepointRange internal_tokenizer_codepoint_ranges = 34; |
| |
|   // Minimum ratio of supported codepoints in the input context. Feature |
|   // computation fails when the ratio is lower than this. |
|   optional float min_supported_codepoint_ratio = 24 [default = 0.0]; |
| |
|   // Versions the format of the features the model expects. |
|   //   - feature_version == 0: |
|   //       For each token the features consist of: |
|   //         - chargram embeddings |
|   //         - dense features |
|   //       The chargram embeddings of the tokens are concatenated first, and |
|   //       the dense features of the tokens are concatenated after them, so |
|   //       the resulting feature vector has two regions. |
|   optional int32 feature_version = 25 [default = 0]; |
| |
|   // Selects the kind of tokenization the model applies to the input text. |
|   enum TokenizationType { |
|     INVALID_TOKENIZATION_TYPE = 0; |
| |
|     // Tokenize with the internal tokenizer. |
|     INTERNAL_TOKENIZER = 1; |
| |
|     // Tokenize with ICU. |
|     ICU = 2; |
| |
|     // Apply ICU tokenization first, then identify stretches of tokens that |
|     // consist only of codepoints in internal_tokenizer_codepoint_ranges and |
|     // re-tokenize them with the internal tokenizer. |
|     MIXED = 3; |
|   } |
|   optional TokenizationType tokenization_type = 30 |
|       [default = INTERNAL_TOKENIZER]; |
| |
|   // NOTE(review): undocumented in the original; presumably controls whether |
|   // whitespace tokens are kept when ICU tokenization is used. Confirm against |
|   // the tokenizer implementation. |
|   optional bool icu_preserve_whitespace_tokens = 31 [default = false]; |
| |
|   // Codepoints that are stripped from the beginning and the end of predicted |
|   // spans. |
|   repeated int32 ignored_span_boundary_codepoints = 36; |
| |
|   // Allowed charactergrams. Extracted charactergrams are filtered against |
|   // this list, and those not present are interpreted as out-of-vocabulary. |
|   // If no allowed_chargrams are specified, all charactergrams are allowed. |
|   // The field is typed as bytes to allow non-UTF8 chargrams. |
|   repeated bytes allowed_chargrams = 38; |
| } |
| |
| // Extensions that attach the option messages defined in this file to |
| // nlp_core.EmbeddingNetworkProto, listed in ascending extension number. |
| extend nlp_core.EmbeddingNetworkProto { |
|   optional FeatureProcessorOptions |
|       feature_processor_options_in_embedding_network_proto = 146230910; |
|   optional SelectionModelOptions |
|       selection_model_options_in_embedding_network_proto = 148190899; |
|   optional ModelOptions model_options_in_embedding_network_proto = 150063045; |
|   optional SharingModelOptions |
|       sharing_model_options_in_embedding_network_proto = 151445439; |
| } |