//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
file_identifier "TC2 ";

namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;
}

// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection a match is assigned to.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group used as match boundaries.
  pattern:string;

  // Whether to apply the pattern for annotation.
  enabled_for_annotation:bool = 0;

  // Whether to apply the pattern for classification.
  enabled_for_classification:bool = 0;

  // Whether to apply the pattern for selection.
  enabled_for_selection:bool = 0;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}
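
// Example (illustrative only): a Pattern could be written in FlatBuffers JSON
// roughly as below; the collection name and regex are hypothetical and not
// taken from any shipped model:
//   {
//     "collection_name": "phone",
//     "pattern": "(\\+?[0-9][0-9 ()-]{6,})",
//     "enabled_for_classification": true,
//     "target_classification_score": 1.0,
//     "priority_score": 0.5
//   }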

namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}

namespace libtextclassifier2;
table DatetimeModelPattern {
  // List of regex patterns.
  regexes:[string];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}

namespace libtextclassifier2;
table DatetimeModelExtractor {
  extractor:libtextclassifier2.DatetimeExtractorType;
  pattern:string;
  locales:[int];
}

namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];
}
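
// Example (illustrative only): if DatetimeModel.locales were
// ["en-US", "en-GB", "de-DE"], a DatetimeModelPattern with locales: [0, 1]
// would apply to the two English locales only, while a pattern with an empty
// locales list would apply to all three.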

// Options controlling the output of the models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;
}

namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;
  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // TFLite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);
  regex_model:libtextclassifier2.RegexModel;

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // A name for the model that can be used, e.g., for logging.
  name:string;
}
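
// Example (illustrative only): a minimal Model could be written in FlatBuffers
// JSON along these lines (all values are hypothetical):
//   {
//     "locales": "en,de",
//     "version": 1,
//     "name": "example_model",
//     "triggering_options": { "min_annotate_confidence": 0.5 }
//   }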

// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the codepoint and discards it (SPLIT_BEFORE | SPLIT_AFTER |
  // DISCARD_CODEPOINT). Good e.g. for the space character.
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  start:int;
  end:int;
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}
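
// Example (illustrative only): to split on the ASCII space character and drop
// it from the output, a range could be { start: 32, end: 33,
// role: WHITESPACE_SEPARATOR }; since the end bound is exclusive, this covers
// only U+0020.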

// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}
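
// Example (illustrative only): with MIXED, ICU tokenizes the whole input
// first; stretches of tokens whose codepoints all fall inside
// internal_tokenizer_codepoint_ranges (e.g. ranges covering CJK characters)
// are then re-tokenized with the internal tokenizer.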

// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  start:int;
  end:int;
}
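
// Example (illustrative only): the ASCII digits '0'-'9' correspond to
// { start: 48, end: 58 }, because the end bound is exclusive.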

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;
}
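
// Example (illustrative only): with num_tokens_before = 2,
// num_tokens_inside_left = 2, num_tokens_inside_right = 2 and
// num_tokens_after = 2, the span "[quick brown fox]" in
// "the very quick brown fox jumped over it" would contribute the token
// windows ("the", "very") before, ("quick", "brown") and ("brown", "fox")
// inside, and ("jumped", "over") after the span.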

namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}

namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span);
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including
  // the infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // Index of a collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for the tokens are concatenated together first,
  //   and the dense features for the tokens are concatenated at the end.
  //   So the resulting feature vector has two regions.
  feature_version:int = 0;
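
  // Example (illustrative only) for feature_version 0: with two tokens t1 and
  // t2, the feature vector is laid out as
  //   [chargram_embedding(t1), chargram_embedding(t2),
  //    dense_features(t1), dense_features(t2)],
  // i.e. all chargram embeddings first, then all dense features.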

  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType;
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from the beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will also be split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;

  // Number of bits used for quantization of embeddings.
  embedding_quantization_bits:int = 8;
}

root_type libtextclassifier2.Model;
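
// Usage sketch (an assumption about tooling, not part of the schema): with the
// FlatBuffers compiler, C++ accessors for this schema can be generated with
// e.g. `flatc --cpp model.fbs`, and a JSON instance can be packed into a
// binary model with e.g. `flatc -b model.fbs model.json`; the file names here
// are hypothetical and exact flags may vary by flatc version.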