blob: 590c81503be6802a38863f0278da7006638e7074 [file] [log] [blame]
//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
Lukas Zilkab23e2122018-02-09 10:25:19 +010017file_identifier "TC2 ";
Lukas Zilka21d8c982018-01-24 11:11:20 +010018
Lukas Zilkaba849e72018-03-08 14:48:21 +010019// The possible model modes, represents a bit field.
20namespace libtextclassifier2;
21enum ModeFlag : int {
22 NONE = 0,
23 ANNOTATION = 1,
24 CLASSIFICATION = 2,
25 ANNOTATION_AND_CLASSIFICATION = 3,
26 SELECTION = 4,
27 ANNOTATION_AND_SELECTION = 5,
28 CLASSIFICATION_AND_SELECTION = 6,
29 ALL = 7,
30}
31
Lukas Zilka21d8c982018-01-24 11:11:20 +010032namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +010033enum DatetimeExtractorType : int {
34 UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
35 AM = 1,
36 PM = 2,
37 JANUARY = 3,
38 FEBRUARY = 4,
39 MARCH = 5,
40 APRIL = 6,
41 MAY = 7,
42 JUNE = 8,
43 JULY = 9,
44 AUGUST = 10,
45 SEPTEMBER = 11,
46 OCTOBER = 12,
47 NOVEMBER = 13,
48 DECEMBER = 14,
49 NEXT = 15,
50 NEXT_OR_SAME = 16,
51 LAST = 17,
52 NOW = 18,
53 TOMORROW = 19,
54 YESTERDAY = 20,
55 PAST = 21,
56 FUTURE = 22,
57 DAY = 23,
58 WEEK = 24,
59 MONTH = 25,
60 YEAR = 26,
61 MONDAY = 27,
62 TUESDAY = 28,
63 WEDNESDAY = 29,
64 THURSDAY = 30,
65 FRIDAY = 31,
66 SATURDAY = 32,
67 SUNDAY = 33,
68 DAYS = 34,
69 WEEKS = 35,
70 MONTHS = 36,
71 HOURS = 37,
72 MINUTES = 38,
73 SECONDS = 39,
74 YEARS = 40,
75 DIGITS = 41,
76 SIGNEDDIGITS = 42,
77 ZERO = 43,
78 ONE = 44,
79 TWO = 45,
80 THREE = 46,
81 FOUR = 47,
82 FIVE = 48,
83 SIX = 49,
84 SEVEN = 50,
85 EIGHT = 51,
86 NINE = 52,
87 TEN = 53,
88 ELEVEN = 54,
89 TWELVE = 55,
90 THIRTEEN = 56,
91 FOURTEEN = 57,
92 FIFTEEN = 58,
93 SIXTEEN = 59,
94 SEVENTEEN = 60,
95 EIGHTEEN = 61,
96 NINETEEN = 62,
97 TWENTY = 63,
98 THIRTY = 64,
99 FORTY = 65,
100 FIFTY = 66,
101 SIXTY = 67,
102 SEVENTY = 68,
103 EIGHTY = 69,
104 NINETY = 70,
105 HUNDRED = 71,
106 THOUSAND = 72,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100107}
108
Lukas Zilkab23e2122018-02-09 10:25:19 +0100109// Options for the model that predicts text selection.
110namespace libtextclassifier2;
111table SelectionModelOptions {
112 // If true, before the selection is returned, the unpaired brackets contained
113 // in the predicted selection are stripped from the both selection ends.
114 // The bracket codepoints are defined in the Unicode standard:
115 // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
116 strip_unpaired_brackets:bool = 1;
117
118 // Number of hypothetical click positions on either side of the actual click
119 // to consider in order to enforce symmetry.
120 symmetry_context_size:int;
121
122 // Number of examples to bundle in one batch for inference.
123 batch_size:int = 1024;
124}
125
126// Options for the model that classifies a text selection.
127namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100128table ClassificationModelOptions {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100129 // Limits for phone numbers.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100130 phone_min_num_digits:int = 7;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100131
Lukas Zilka21d8c982018-01-24 11:11:20 +0100132 phone_max_num_digits:int = 15;
133}
134
Lukas Zilkab23e2122018-02-09 10:25:19 +0100135// List of regular expression matchers to check.
136namespace libtextclassifier2.RegexModel_;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100137table Pattern {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100138 // The name of the collection of a match.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100139 collection_name:string;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100140
141 // The pattern to check.
142 // Can specify a single capturing group used as match boundaries.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100143 pattern:string;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100144
Lukas Zilkaba849e72018-03-08 14:48:21 +0100145 // The modes for which to apply the patterns.
146 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100147
148 // The final score to assign to the results of this pattern.
149 target_classification_score:float = 1;
150
Lukas Zilkadf710db2018-02-27 12:44:09 +0100151 // Priority score used for conflict resolution with the other models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100152 priority_score:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100153
154 // If true, will use an approximate matching implementation implemented
155 // using Find() instead of the true Match(). This approximate matching will
156 // use the first Find() result and then check that it spans the whole input.
157 use_approximate_matching:bool = 0;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100158}
159
160namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100161table RegexModel {
162 patterns:[libtextclassifier2.RegexModel_.Pattern];
Lukas Zilka21d8c982018-01-24 11:11:20 +0100163}
164
165namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100166table DatetimeModelPattern {
167 // List of regex patterns.
168 regexes:[string];
Lukas Zilka21d8c982018-01-24 11:11:20 +0100169
Lukas Zilkab23e2122018-02-09 10:25:19 +0100170 // List of locale indices in DatetimeModel that represent the locales that
171 // these patterns should be used for. If empty, can be used for all locales.
172 locales:[int];
173
174 // The final score to assign to the results of this pattern.
175 target_classification_score:float = 1;
176
177 // Priority score used for conflict resulution with the other models.
178 priority_score:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100179
180 // The modes for which to apply the patterns.
181 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100182}
183
184namespace libtextclassifier2;
185table DatetimeModelExtractor {
186 extractor:libtextclassifier2.DatetimeExtractorType;
187 pattern:string;
188 locales:[int];
189}
190
191namespace libtextclassifier2;
192table DatetimeModel {
193 // List of BCP 47 locale strings representing all locales supported by the
194 // model. The individual patterns refer back to them using an index.
195 locales:[string];
196
197 patterns:[libtextclassifier2.DatetimeModelPattern];
198 extractors:[libtextclassifier2.DatetimeModelExtractor];
Lukas Zilkaba849e72018-03-08 14:48:21 +0100199
200 // If true, will use the extractors for determining the match location as
201 // opposed to using the location where the global pattern matched.
202 use_extractors_for_locating:bool = 1;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100203}
204
Lukas Zilkaba849e72018-03-08 14:48:21 +0100205// Options controlling the output of the Tensorflow Lite models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100206namespace libtextclassifier2;
207table ModelTriggeringOptions {
208 // Lower bound threshold for filtering annotation model outputs.
209 min_annotate_confidence:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100210
211 // The modes for which to enable the models.
212 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100213}
214
215namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100216table Model {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100217 // Comma-separated list of locales supported by the model as BCP 47 tags.
218 locales:string;
219
Lukas Zilka21d8c982018-01-24 11:11:20 +0100220 version:int;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100221
222 // A name for the model that can be used for e.g. logging.
223 name:string;
224
Lukas Zilka21d8c982018-01-24 11:11:20 +0100225 selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
226 classification_feature_options:libtextclassifier2.FeatureProcessorOptions;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100227
Lukas Zilkaba849e72018-03-08 14:48:21 +0100228 // Tensorflow Lite models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100229 selection_model:[ubyte] (force_align: 16);
230
231 classification_model:[ubyte] (force_align: 16);
232 embedding_model:[ubyte] (force_align: 16);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100233
234 // Options for the different models.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100235 selection_options:libtextclassifier2.SelectionModelOptions;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100236
Lukas Zilka21d8c982018-01-24 11:11:20 +0100237 classification_options:libtextclassifier2.ClassificationModelOptions;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100238 regex_model:libtextclassifier2.RegexModel;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100239 datetime_model:libtextclassifier2.DatetimeModel;
240
241 // Options controlling the output of the models.
242 triggering_options:libtextclassifier2.ModelTriggeringOptions;
Lukas Zilkadf710db2018-02-27 12:44:09 +0100243
Lukas Zilkaba849e72018-03-08 14:48:21 +0100244 // Global switch that controls if SuggestSelection(), ClassifyText() and
245 // Annotate() will run. If a mode is disabled it returns empty/no-op results.
246 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100247}
248
Lukas Zilkab23e2122018-02-09 10:25:19 +0100249// Role of the codepoints in the range.
250namespace libtextclassifier2.TokenizationCodepointRange_;
251enum Role : int {
252 // Concatenates the codepoint to the current run of codepoints.
253 DEFAULT_ROLE = 0,
254
255 // Splits a run of codepoints before the current codepoint.
256 SPLIT_BEFORE = 1,
257
258 // Splits a run of codepoints after the current codepoint.
259 SPLIT_AFTER = 2,
260
261 // Each codepoint will be a separate token. Good e.g. for Chinese
262 // characters.
263 TOKEN_SEPARATOR = 3,
264
265 // Discards the codepoint.
266 DISCARD_CODEPOINT = 4,
267
268 // Common values:
269 // Splits on the characters and discards them. Good e.g. for the space
270 // character.
271 WHITESPACE_SEPARATOR = 7,
272}
273
274// Represents a codepoint range [start, end) with its role for tokenization.
275namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100276table TokenizationCodepointRange {
277 start:int;
278 end:int;
279 role:libtextclassifier2.TokenizationCodepointRange_.Role;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100280
281 // Integer identifier of the script this range denotes. Negative values are
282 // reserved for Tokenizer's internal use.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100283 script_id:int;
284}
285
Lukas Zilkab23e2122018-02-09 10:25:19 +0100286// Method for selecting the center token.
287namespace libtextclassifier2.FeatureProcessorOptions_;
288enum CenterTokenSelectionMethod : int {
289 DEFAULT_CENTER_TOKEN_METHOD = 0,
290
291 // Use click indices to determine the center token.
292 CENTER_TOKEN_FROM_CLICK = 1,
293
294 // Use selection indices to get a token range, and select the middle of it
295 // as the center token.
296 CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100297}
298
Lukas Zilkab23e2122018-02-09 10:25:19 +0100299// Controls the type of tokenization the model will use for the input text.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100300namespace libtextclassifier2.FeatureProcessorOptions_;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100301enum TokenizationType : int {
302 INVALID_TOKENIZATION_TYPE = 0,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100303
Lukas Zilkab23e2122018-02-09 10:25:19 +0100304 // Use the internal tokenizer for tokenization.
305 INTERNAL_TOKENIZER = 1,
306
307 // Use ICU for tokenization.
308 ICU = 2,
309
310 // First apply ICU tokenization. Then identify stretches of tokens
311 // consisting only of codepoints in internal_tokenizer_codepoint_ranges
312 // and re-tokenize them using the internal tokenizer.
313 MIXED = 3,
314}
315
316// Range of codepoints start - end, where end is exclusive.
317namespace libtextclassifier2.FeatureProcessorOptions_;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100318table CodepointRange {
319 start:int;
320 end:int;
321}
322
Lukas Zilkadf710db2018-02-27 12:44:09 +0100323// Bounds-sensitive feature extraction configuration.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100324namespace libtextclassifier2.FeatureProcessorOptions_;
325table BoundsSensitiveFeatures {
326 // Enables the extraction of bounds-sensitive features, instead of the click
327 // context features.
328 enabled:bool;
329
330 // The numbers of tokens to extract in specific locations relative to the
331 // bounds.
332 // Immediately before the span.
333 num_tokens_before:int;
334
335 // Inside the span, aligned with the beginning.
336 num_tokens_inside_left:int;
337
338 // Inside the span, aligned with the end.
339 num_tokens_inside_right:int;
340
341 // Immediately after the span.
342 num_tokens_after:int;
343
344 // If true, also extracts the tokens of the entire span and adds up their
345 // features forming one "token" to include in the extracted features.
346 include_inside_bag:bool;
347
348 // If true, includes the selection length (in the number of tokens) as a
349 // feature.
350 include_inside_length:bool;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100351
352 // If true, for selection, single token spans are not run through the model
353 // and their score is assumed to be zero.
354 score_single_token_spans_as_zero:bool;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100355}
356
357namespace libtextclassifier2.FeatureProcessorOptions_;
358table AlternativeCollectionMapEntry {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100359 key:string;
360 value:string;
361}
362
Lukas Zilkab23e2122018-02-09 10:25:19 +0100363namespace libtextclassifier2;
364table FeatureProcessorOptions {
365 // Number of buckets used for hashing charactergrams.
366 num_buckets:int = -1;
367
368 // Size of the embedding.
369 embedding_size:int = -1;
370
Lukas Zilkaba849e72018-03-08 14:48:21 +0100371 // Number of bits for quantization for embeddings.
372 embedding_quantization_bits:int = 8;
373
Lukas Zilkab23e2122018-02-09 10:25:19 +0100374 // Context size defines the number of words to the left and to the right of
375 // the selected word to be used as context. For example, if context size is
376 // N, then we take N words to the left and N words to the right of the
377 // selected word as its context.
378 context_size:int = -1;
379
380 // Maximum number of words of the context to select in total.
381 max_selection_span:int = -1;
382
383 // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
384 // character trigrams etc.
385 chargram_orders:[int];
386
387 // Maximum length of a word, in codepoints.
388 max_word_length:int = 20;
389
390 // If true, will use the unicode-aware functionality for extracting features.
391 unicode_aware_features:bool = 0;
392
393 // Whether to extract the token case feature.
394 extract_case_feature:bool = 0;
395
396 // Whether to extract the selection mask feature.
397 extract_selection_mask_feature:bool = 0;
398
399 // List of regexps to run over each token. For each regexp, if there is a
400 // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
401 regexp_feature:[string];
402
403 // Whether to remap all digits to a single number.
404 remap_digits:bool = 0;
405
406 // Whether to lower-case each token before generating hashgrams.
407 lowercase_tokens:bool;
408
409 // If true, the selection classifier output will contain only the selections
410 // that are feasible (e.g., those that are shorter than max_selection_span),
411 // if false, the output will be a complete cross-product of possible
412 // selections to the left and posible selections to the right, including the
413 // infeasible ones.
414 // NOTE: Exists mainly for compatibility with older models that were trained
415 // with the non-reduced output space.
416 selection_reduced_output_space:bool = 1;
417
418 // Collection names.
419 collections:[string];
420
421 // An index of collection in collections to be used if a collection name can't
422 // be mapped to an id.
423 default_collection:int = -1;
424
425 // If true, will split the input by lines, and only use the line that contains
426 // the clicked token.
427 only_use_line_with_click:bool = 0;
428
429 // If true, will split tokens that contain the selection boundary, at the
430 // position of the boundary.
431 // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
432 split_tokens_on_selection_boundaries:bool = 0;
433
434 // Codepoint ranges that determine how different codepoints are tokenized.
435 // The ranges must not overlap.
436 tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];
437
438 center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;
439
440 // If true, span boundaries will be snapped to containing tokens and not
441 // required to exactly match token boundaries.
442 snap_label_span_boundaries_to_containing_tokens:bool;
443
444 // A set of codepoint ranges supported by the model.
445 supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
446
447 // A set of codepoint ranges to use in the mixed tokenization mode to identify
448 // stretches of tokens to re-tokenize using the internal tokenizer.
449 internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
450
451 // Minimum ratio of supported codepoints in the input context. If the ratio
452 // is lower than this, the feature computation will fail.
453 min_supported_codepoint_ratio:float = 0;
454
455 // Used for versioning the format of features the model expects.
456 // - feature_version == 0:
457 // For each token the features consist of:
458 // - chargram embeddings
459 // - dense features
460 // Chargram embeddings for tokens are concatenated first together,
461 // and at the end, the dense features for the tokens are concatenated
462 // to it. So the resulting feature vector has two regions.
463 feature_version:int = 0;
464
Lukas Zilkaba849e72018-03-08 14:48:21 +0100465 tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100466 icu_preserve_whitespace_tokens:bool = 0;
467
468 // List of codepoints that will be stripped from beginning and end of
469 // predicted spans.
470 ignored_span_boundary_codepoints:[int];
471
472 bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;
473
474 // List of allowed charactergrams. The extracted charactergrams are filtered
475 // using this list, and charactergrams that are not present are interpreted as
476 // out-of-vocabulary.
477 // If no allowed_chargrams are specified, all charactergrams are allowed.
478 // The field is typed as bytes type to allow non-UTF8 chargrams.
479 allowed_chargrams:[string];
480
481 // If true, tokens will be also split when the codepoint's script_id changes
482 // as defined in TokenizationCodepointRange.
483 tokenize_on_script_change:bool = 0;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100484}
485
Lukas Zilkab23e2122018-02-09 10:25:19 +0100486root_type libtextclassifier2.Model;