// smartselect/text-classification-model.proto - platform/external/libtextclassifier - Gitiles

 // Copyright (C) 2017 The Android Open Source Project
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Text classification model configuration.

 syntax = "proto2";
 option optimize_for = LITE_RUNTIME;

 import "external/libtextclassifier/common/embedding-network.proto";
 import "external/libtextclassifier/smartselect/tokenizer.proto";

 package libtextclassifier;

 // Generic options of a model, non-specific to selection or sharing.
 // All fields are proto2 `optional` and declare no explicit default.
 message ModelOptions {
   // If true, will use embeddings from a different model. This is mainly useful
   // for the Sharing model using the embeddings from the Selection model.
   optional bool use_shared_embeddings = 1;

   // Language of the model.
   // NOTE(review): the expected format (e.g. a language code) is not stated
   // here -- confirm against the code that reads this field.
   optional string language = 2;

   // Version of the model.
   optional int32 version = 3;
 }

 // Options specific to the selection model.
 message SelectionModelOptions {
   // A list of Unicode codepoints to strip from predicted selections.
   // NOTE(review): the "deprecated_" name prefix suggests this field has been
   // superseded -- presumably by
   // FeatureProcessorOptions.ignored_span_boundary_codepoints; confirm.
   repeated int32 deprecated_punctuation_to_strip = 1;

   // Enforce symmetrical selections.
   optional bool enforce_symmetry = 3;

   // Number of inferences made around the click position (to one side), for
   // enforcing symmetry.
   optional int32 symmetry_context_size = 4;

   // Field number 2 is reserved so that it cannot be assigned to a new field.
   reserved 2;
 }

 // Options specific to the sharing model.
 message SharingModelOptions {
   // If true, will always return "url" when the url hint is passed in.
   optional bool always_accept_url_hint = 1;

   // If true, will always return "email" when the e-mail hint is passed in.
   optional bool always_accept_email_hint = 2;

   // Limits for phone numbers: minimum and maximum number of digits
   // (defaults: 7 and 15).
   // NOTE(review): whether the bounds are inclusive is not visible here --
   // confirm against the code that reads these fields.
   optional int32 phone_min_num_digits = 3 [default = 7];
   optional int32 phone_max_num_digits = 4 [default = 15];

   // List of regular expression matchers to check.
   // Each RegexPattern pairs a pattern with the collection name to use for a
   // match.
   message RegexPattern {
     // The name of the collection of a match.
     optional string collection_name = 1;

     // The pattern to check.
     // NOTE(review): the regular expression dialect is not stated here.
     optional string pattern = 2;
   }
   // The configured matchers (see RegexPattern).
   repeated RegexPattern regex_pattern = 5;
 }

 // Options controlling how input text is tokenized and converted into
 // features for the model.
 //
 // Next ID: 39
 // NOTE(review): field numbers 17 and 37 are neither used nor reserved in this
 // message -- confirm they were never assigned before handing them out.
 message FeatureProcessorOptions {
   // Number of buckets used for hashing charactergrams.
   // NOTE(review): the default of -1 presumably means "not configured" (same
   // for context_size and max_selection_span below) -- confirm.
   optional int32 num_buckets = 1 [default = -1];

   // Context size defines the number of words to the left and to the right of
   // the selected word to be used as context. For example, if context size is
   // N, then we take N words to the left and N words to the right of the
   // selected word as its context.
   optional int32 context_size = 2 [default = -1];

   // Maximum number of words of the context to select in total.
   optional int32 max_selection_span = 3 [default = -1];

   // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
   // character trigrams etc.
   repeated int32 chargram_orders = 4;

   // Maximum length of a word, in codepoints.
   optional int32 max_word_length = 21 [default = 20];

   // If true, will use the unicode-aware functionality for extracting features.
   optional bool unicode_aware_features = 19 [default = false];

   // Whether to extract the token case feature.
   optional bool extract_case_feature = 5 [default = false];

   // Whether to extract the selection mask feature.
   optional bool extract_selection_mask_feature = 6 [default = false];

   // List of regexps to run over each token. For each regexp, if there is a
   // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
   repeated string regexp_feature = 22;

   // Whether to remap all digits to a single number.
   optional bool remap_digits = 20 [default = false];

   // Whether to lower-case each token before generating hashgrams.
   optional bool lowercase_tokens = 33;

   // If true, the selection classifier output will contain only the selections
   // that are feasible (e.g., those that are shorter than max_selection_span),
   // if false, the output will be a complete cross-product of possible
   // selections to the left and possible selections to the right, including the
   // infeasible ones.
   // NOTE: Exists mainly for compatibility with older models that were trained
   // with the non-reduced output space.
   optional bool selection_reduced_output_space = 8 [default = true];

   // Collection names.
   repeated string collections = 9;

   // An index of collection in collections to be used if a collection name can't
   // be mapped to an id.
   optional int32 default_collection = 10 [default = -1];

   // If true, will split the input by lines, and only use the line that contains
   // the clicked token.
   optional bool only_use_line_with_click = 13 [default = false];

   // If true, will split tokens that contain the selection boundary, at the
   // position of the boundary.
   // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
   optional bool split_tokens_on_selection_boundaries = 14 [default = false];

   // Codepoint ranges that determine how different codepoints are tokenized.
   // The ranges must not overlap.
   repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

   // Method for selecting the center token.
   enum CenterTokenSelectionMethod {
     DEFAULT_CENTER_TOKEN_METHOD = 0;  // Invalid option.

     // Use click indices to determine the center token.
     CENTER_TOKEN_FROM_CLICK = 1;

     // Use selection indices to get a token range, and select the middle of it
     // as the center token.
     CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
   }
   optional CenterTokenSelectionMethod center_token_selection_method = 16;

   // If true, span boundaries will be snapped to containing tokens and not
   // required to exactly match token boundaries.
   optional bool snap_label_span_boundaries_to_containing_tokens = 18;

   // Range of codepoints start - end, where end is exclusive.
   message CodepointRange {
     optional int32 start = 1;
     optional int32 end = 2;
   }

   // A set of codepoint ranges supported by the model.
   repeated CodepointRange supported_codepoint_ranges = 23;

   // A set of codepoint ranges to use in the mixed tokenization mode to identify
   // stretches of tokens to re-tokenize using the internal tokenizer.
   repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

   // Minimum ratio of supported codepoints in the input context. If the ratio
   // is lower than this, the feature computation will fail.
   optional float min_supported_codepoint_ratio = 24 [default = 0.0];

   // Used for versioning the format of features the model expects.
   //  - feature_version == 0:
   //      For each token the features consist of:
   //       - chargram embeddings
   //       - dense features
   //      Chargram embeddings for tokens are concatenated first together,
   //      and at the end, the dense features for the tokens are concatenated
   //      to it. So the resulting feature vector has two regions.
   optional int32 feature_version = 25 [default = 0];

   // Controls the type of tokenization the model will use for the input text.
   enum TokenizationType {
     INVALID_TOKENIZATION_TYPE = 0;

     // Use the internal tokenizer for tokenization.
     INTERNAL_TOKENIZER = 1;

     // Use ICU for tokenization.
     ICU = 2;

     // First apply ICU tokenization. Then identify stretches of tokens
     // consisting only of codepoints in internal_tokenizer_codepoint_ranges
     // and re-tokenize them using the internal tokenizer.
     MIXED = 3;
   }
   // Tokenization type to use; defaults to INTERNAL_TOKENIZER.
   optional TokenizationType tokenization_type = 30
       [default = INTERNAL_TOKENIZER];

   // NOTE(review): undocumented in the original; presumably controls whether
   // whitespace tokens produced by ICU tokenization are kept -- confirm.
   optional bool icu_preserve_whitespace_tokens = 31 [default = false];

   // List of codepoints that will be stripped from beginning and end of
   // predicted spans.
   repeated int32 ignored_span_boundary_codepoints = 36;

   // Field numbers that must not be assigned to new fields.
   reserved 7, 11, 12, 26, 27, 28, 29, 32, 35;

   // List of allowed charactergrams. The extracted charactergrams are filtered
   // using this list, and charactergrams that are not present are interpreted as
   // out-of-vocabulary.
   // If no allowed_chargrams are specified, all charactergrams are allowed.
   // The field is typed as bytes type to allow non-UTF8 chargrams.
   repeated bytes allowed_chargrams = 38;
 }

 // Extension fields declared on nlp_core.EmbeddingNetworkProto, one per options
 // message in this file, so that each options message can be carried inside an
 // EmbeddingNetworkProto.
 // The extension numbers identify these fields on the wire; do not change them.
 extend nlp_core.EmbeddingNetworkProto {
   optional ModelOptions model_options_in_embedding_network_proto = 150063045;
   optional FeatureProcessorOptions
       feature_processor_options_in_embedding_network_proto = 146230910;
   optional SelectionModelOptions
       selection_model_options_in_embedding_network_proto = 148190899;
   optional SharingModelOptions
       sharing_model_options_in_embedding_network_proto = 151445439;
 }
	// Copyright (C) 2017 The Android Open Source Project
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// Text classification model configuration.

	syntax = "proto2";
	option optimize_for = LITE_RUNTIME;

	import "external/libtextclassifier/common/embedding-network.proto";
	import "external/libtextclassifier/smartselect/tokenizer.proto";

	package libtextclassifier;

	// Model-wide options that are not specific to either selection or sharing.
	message ModelOptions {
	  // When true, embeddings are taken from a different model. The main use is
	  // letting the Sharing model reuse the Selection model's embeddings.
	  optional bool use_shared_embeddings = 1;

	  // The model's language.
	  optional string language = 2;

	  // The model's version.
	  optional int32 version = 3;
	}

	// Options for the selection model.
	message SelectionModelOptions {
	  // Unicode codepoints that are stripped from predicted selections.
	  repeated int32 deprecated_punctuation_to_strip = 1;

	  // Whether symmetrical selections are enforced.
	  optional bool enforce_symmetry = 3;

	  // How many inferences are made around the click position (counted to one
	  // side) when enforcing symmetry.
	  optional int32 symmetry_context_size = 4;

	  // Field number 2 may not be assigned to a new field.
	  reserved 2;
	}

	// Options for the sharing model.
	message SharingModelOptions {
	  // When true, "url" is always returned if the url hint is passed in.
	  optional bool always_accept_url_hint = 1;

	  // When true, "email" is always returned if the e-mail hint is passed in.
	  optional bool always_accept_email_hint = 2;

	  // Digit-count limits for phone numbers (minimum defaults to 7, maximum
	  // defaults to 15).
	  optional int32 phone_min_num_digits = 3 [default = 7];
	  optional int32 phone_max_num_digits = 4 [default = 15];

	  // One regular expression matcher.
	  message RegexPattern {
	    // Collection name associated with a match.
	    optional string collection_name = 1;

	    // Pattern to check.
	    optional string pattern = 2;
	  }

	  // Regular expression matchers to check.
	  repeated RegexPattern regex_pattern = 5;
	}

	// Options controlling how input text is tokenized and converted into
	// features for the model.
	//
	// Next ID: 39
	// NOTE(review): field numbers 17 and 37 are neither used nor reserved in this
	// message -- confirm they were never assigned before handing them out.
	message FeatureProcessorOptions {
	  // Number of buckets used for hashing charactergrams.
	  // NOTE(review): the default of -1 presumably means "not configured" (same
	  // for context_size and max_selection_span below) -- confirm.
	  optional int32 num_buckets = 1 [default = -1];

	  // Context size defines the number of words to the left and to the right of
	  // the selected word to be used as context. For example, if context size is
	  // N, then we take N words to the left and N words to the right of the
	  // selected word as its context.
	  optional int32 context_size = 2 [default = -1];

	  // Maximum number of words of the context to select in total.
	  optional int32 max_selection_span = 3 [default = -1];

	  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
	  // character trigrams etc.
	  repeated int32 chargram_orders = 4;

	  // Maximum length of a word, in codepoints.
	  optional int32 max_word_length = 21 [default = 20];

	  // If true, will use the unicode-aware functionality for extracting features.
	  optional bool unicode_aware_features = 19 [default = false];

	  // Whether to extract the token case feature.
	  optional bool extract_case_feature = 5 [default = false];

	  // Whether to extract the selection mask feature.
	  optional bool extract_selection_mask_feature = 6 [default = false];

	  // List of regexps to run over each token. For each regexp, if there is a
	  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
	  repeated string regexp_feature = 22;

	  // Whether to remap all digits to a single number.
	  optional bool remap_digits = 20 [default = false];

	  // Whether to lower-case each token before generating hashgrams.
	  optional bool lowercase_tokens = 33;

	  // If true, the selection classifier output will contain only the selections
	  // that are feasible (e.g., those that are shorter than max_selection_span),
	  // if false, the output will be a complete cross-product of possible
	  // selections to the left and possible selections to the right, including the
	  // infeasible ones.
	  // NOTE: Exists mainly for compatibility with older models that were trained
	  // with the non-reduced output space.
	  optional bool selection_reduced_output_space = 8 [default = true];

	  // Collection names.
	  repeated string collections = 9;

	  // An index of collection in collections to be used if a collection name can't
	  // be mapped to an id.
	  optional int32 default_collection = 10 [default = -1];

	  // If true, will split the input by lines, and only use the line that contains
	  // the clicked token.
	  optional bool only_use_line_with_click = 13 [default = false];

	  // If true, will split tokens that contain the selection boundary, at the
	  // position of the boundary.
	  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
	  optional bool split_tokens_on_selection_boundaries = 14 [default = false];

	  // Codepoint ranges that determine how different codepoints are tokenized.
	  // The ranges must not overlap.
	  repeated TokenizationCodepointRange tokenization_codepoint_config = 15;

	  // Method for selecting the center token.
	  enum CenterTokenSelectionMethod {
	    DEFAULT_CENTER_TOKEN_METHOD = 0; // Invalid option.

	    // Use click indices to determine the center token.
	    CENTER_TOKEN_FROM_CLICK = 1;

	    // Use selection indices to get a token range, and select the middle of it
	    // as the center token.
	    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
	  }
	  optional CenterTokenSelectionMethod center_token_selection_method = 16;

	  // If true, span boundaries will be snapped to containing tokens and not
	  // required to exactly match token boundaries.
	  optional bool snap_label_span_boundaries_to_containing_tokens = 18;

	  // Range of codepoints start - end, where end is exclusive.
	  message CodepointRange {
	    optional int32 start = 1;
	    optional int32 end = 2;
	  }

	  // A set of codepoint ranges supported by the model.
	  repeated CodepointRange supported_codepoint_ranges = 23;

	  // A set of codepoint ranges to use in the mixed tokenization mode to identify
	  // stretches of tokens to re-tokenize using the internal tokenizer.
	  repeated CodepointRange internal_tokenizer_codepoint_ranges = 34;

	  // Minimum ratio of supported codepoints in the input context. If the ratio
	  // is lower than this, the feature computation will fail.
	  optional float min_supported_codepoint_ratio = 24 [default = 0.0];

	  // Used for versioning the format of features the model expects.
	  //  - feature_version == 0:
	  //      For each token the features consist of:
	  //       - chargram embeddings
	  //       - dense features
	  //      Chargram embeddings for tokens are concatenated first together,
	  //      and at the end, the dense features for the tokens are concatenated
	  //      to it. So the resulting feature vector has two regions.
	  optional int32 feature_version = 25 [default = 0];

	  // Controls the type of tokenization the model will use for the input text.
	  enum TokenizationType {
	    INVALID_TOKENIZATION_TYPE = 0;

	    // Use the internal tokenizer for tokenization.
	    INTERNAL_TOKENIZER = 1;

	    // Use ICU for tokenization.
	    ICU = 2;

	    // First apply ICU tokenization. Then identify stretches of tokens
	    // consisting only of codepoints in internal_tokenizer_codepoint_ranges
	    // and re-tokenize them using the internal tokenizer.
	    MIXED = 3;
	  }
	  // Tokenization type to use; defaults to INTERNAL_TOKENIZER.
	  optional TokenizationType tokenization_type = 30
	      [default = INTERNAL_TOKENIZER];

	  // NOTE(review): undocumented in the original; presumably controls whether
	  // whitespace tokens produced by ICU tokenization are kept -- confirm.
	  optional bool icu_preserve_whitespace_tokens = 31 [default = false];

	  // List of codepoints that will be stripped from beginning and end of
	  // predicted spans.
	  repeated int32 ignored_span_boundary_codepoints = 36;

	  // Field numbers that must not be assigned to new fields.
	  reserved 7, 11, 12, 26, 27, 28, 29, 32, 35;

	  // List of allowed charactergrams. The extracted charactergrams are filtered
	  // using this list, and charactergrams that are not present are interpreted as
	  // out-of-vocabulary.
	  // If no allowed_chargrams are specified, all charactergrams are allowed.
	  // The field is typed as bytes type to allow non-UTF8 chargrams.
	  repeated bytes allowed_chargrams = 38;
	}

	// Extension fields on nlp_core.EmbeddingNetworkProto: one optional extension
	// for each options message declared in this file.
	extend nlp_core.EmbeddingNetworkProto {
	  optional ModelOptions model_options_in_embedding_network_proto = 150063045;
	  optional FeatureProcessorOptions
	      feature_processor_options_in_embedding_network_proto = 146230910;
	  optional SelectionModelOptions
	      selection_model_options_in_embedding_network_proto = 148190899;
	  optional SharingModelOptions
	      sharing_model_options_in_embedding_network_proto = 151445439;
	}