blob: 590c81503be6802a38863f0278da7006638e7074 [file] [log] [blame]
//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
Lukas Zilkab23e2122018-02-09 10:25:19 +010017file_identifier "TC2 ";
Lukas Zilka21d8c982018-01-24 11:11:20 +010018
Lukas Zilkaba849e72018-03-08 14:48:21 +010019// The possible model modes, represents a bit field.
20namespace libtextclassifier2;
21enum ModeFlag : int {
22 NONE = 0,
23 ANNOTATION = 1,
24 CLASSIFICATION = 2,
25 ANNOTATION_AND_CLASSIFICATION = 3,
26 SELECTION = 4,
27 ANNOTATION_AND_SELECTION = 5,
28 CLASSIFICATION_AND_SELECTION = 6,
29 ALL = 7,
30}
31
Lukas Zilka21d8c982018-01-24 11:11:20 +010032namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +010033enum DatetimeExtractorType : int {
34 UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
35 AM = 1,
36 PM = 2,
37 JANUARY = 3,
38 FEBRUARY = 4,
39 MARCH = 5,
40 APRIL = 6,
41 MAY = 7,
42 JUNE = 8,
43 JULY = 9,
44 AUGUST = 10,
45 SEPTEMBER = 11,
46 OCTOBER = 12,
47 NOVEMBER = 13,
48 DECEMBER = 14,
49 NEXT = 15,
50 NEXT_OR_SAME = 16,
51 LAST = 17,
52 NOW = 18,
53 TOMORROW = 19,
54 YESTERDAY = 20,
55 PAST = 21,
56 FUTURE = 22,
57 DAY = 23,
58 WEEK = 24,
59 MONTH = 25,
60 YEAR = 26,
61 MONDAY = 27,
62 TUESDAY = 28,
63 WEDNESDAY = 29,
64 THURSDAY = 30,
65 FRIDAY = 31,
66 SATURDAY = 32,
67 SUNDAY = 33,
68 DAYS = 34,
69 WEEKS = 35,
70 MONTHS = 36,
71 HOURS = 37,
72 MINUTES = 38,
73 SECONDS = 39,
74 YEARS = 40,
75 DIGITS = 41,
76 SIGNEDDIGITS = 42,
77 ZERO = 43,
78 ONE = 44,
79 TWO = 45,
80 THREE = 46,
81 FOUR = 47,
82 FIVE = 48,
83 SIX = 49,
84 SEVEN = 50,
85 EIGHT = 51,
86 NINE = 52,
87 TEN = 53,
88 ELEVEN = 54,
89 TWELVE = 55,
90 THIRTEEN = 56,
91 FOURTEEN = 57,
92 FIFTEEN = 58,
93 SIXTEEN = 59,
94 SEVENTEEN = 60,
95 EIGHTEEN = 61,
96 NINETEEN = 62,
97 TWENTY = 63,
98 THIRTY = 64,
99 FORTY = 65,
100 FIFTY = 66,
101 SIXTY = 67,
102 SEVENTY = 68,
103 EIGHTY = 69,
104 NINETY = 70,
105 HUNDRED = 71,
106 THOUSAND = 72,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100107}
108
Lukas Zilkab23e2122018-02-09 10:25:19 +0100109// Options for the model that predicts text selection.
110namespace libtextclassifier2;
111table SelectionModelOptions {
112 // If true, before the selection is returned, the unpaired brackets contained
113 // in the predicted selection are stripped from the both selection ends.
114 // The bracket codepoints are defined in the Unicode standard:
115 // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
116 strip_unpaired_brackets:bool = 1;
117
118 // Number of hypothetical click positions on either side of the actual click
119 // to consider in order to enforce symmetry.
120 symmetry_context_size:int;
121
122 // Number of examples to bundle in one batch for inference.
123 batch_size:int = 1024;
124}
125
126// Options for the model that classifies a text selection.
127namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100128table ClassificationModelOptions {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100129 // Limits for phone numbers.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100130 phone_min_num_digits:int = 7;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100131
Lukas Zilka21d8c982018-01-24 11:11:20 +0100132 phone_max_num_digits:int = 15;
133}
134
Lukas Zilkab23e2122018-02-09 10:25:19 +0100135// List of regular expression matchers to check.
136namespace libtextclassifier2.RegexModel_;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100137table Pattern {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100138 // The name of the collection of a match.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100139 collection_name:string;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100140
141 // The pattern to check.
142 // Can specify a single capturing group used as match boundaries.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100143 pattern:string;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100144
Lukas Zilkaba849e72018-03-08 14:48:21 +0100145 // The modes for which to apply the patterns.
146 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100147
148 // The final score to assign to the results of this pattern.
149 target_classification_score:float = 1;
150
Lukas Zilkadf710db2018-02-27 12:44:09 +0100151 // Priority score used for conflict resolution with the other models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100152 priority_score:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100153
154 // If true, will use an approximate matching implementation implemented
155 // using Find() instead of the true Match(). This approximate matching will
156 // use the first Find() result and then check that it spans the whole input.
157 use_approximate_matching:bool = 0;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100158}
159
160namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100161table RegexModel {
162 patterns:[libtextclassifier2.RegexModel_.Pattern];
Lukas Zilka21d8c982018-01-24 11:11:20 +0100163}
164
165namespace libtextclassifier2;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100166table DatetimeModelPattern {
167 // List of regex patterns.
168 regexes:[string];
Lukas Zilka21d8c982018-01-24 11:11:20 +0100169
Lukas Zilkab23e2122018-02-09 10:25:19 +0100170 // List of locale indices in DatetimeModel that represent the locales that
171 // these patterns should be used for. If empty, can be used for all locales.
172 locales:[int];
173
174 // The final score to assign to the results of this pattern.
175 target_classification_score:float = 1;
176
177 // Priority score used for conflict resulution with the other models.
178 priority_score:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100179
180 // The modes for which to apply the patterns.
181 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100182}
183
184namespace libtextclassifier2;
185table DatetimeModelExtractor {
186 extractor:libtextclassifier2.DatetimeExtractorType;
187 pattern:string;
188 locales:[int];
189}
190
191namespace libtextclassifier2;
192table DatetimeModel {
193 // List of BCP 47 locale strings representing all locales supported by the
194 // model. The individual patterns refer back to them using an index.
195 locales:[string];
196
197 patterns:[libtextclassifier2.DatetimeModelPattern];
198 extractors:[libtextclassifier2.DatetimeModelExtractor];
Lukas Zilkaba849e72018-03-08 14:48:21 +0100199
200 // If true, will use the extractors for determining the match location as
201 // opposed to using the location where the global pattern matched.
202 use_extractors_for_locating:bool = 1;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100203}
204
Lukas Zilkaba849e72018-03-08 14:48:21 +0100205// Options controlling the output of the Tensorflow Lite models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100206namespace libtextclassifier2;
207table ModelTriggeringOptions {
208 // Lower bound threshold for filtering annotation model outputs.
209 min_annotate_confidence:float = 0;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100210
211 // The modes for which to enable the models.
212 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100213}
214
215namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100216table Model {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100217 // Comma-separated list of locales supported by the model as BCP 47 tags.
218 locales:string;
219
Lukas Zilka21d8c982018-01-24 11:11:20 +0100220 version:int;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100221
222 // A name for the model that can be used for e.g. logging.
223 name:string;
224
Lukas Zilka21d8c982018-01-24 11:11:20 +0100225 selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
226 classification_feature_options:libtextclassifier2.FeatureProcessorOptions;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100227
Lukas Zilkaba849e72018-03-08 14:48:21 +0100228 // Tensorflow Lite models.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100229 selection_model:[ubyte] (force_align: 16);
230
231 classification_model:[ubyte] (force_align: 16);
232 embedding_model:[ubyte] (force_align: 16);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100233
234 // Options for the different models.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100235 selection_options:libtextclassifier2.SelectionModelOptions;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100236
Lukas Zilka21d8c982018-01-24 11:11:20 +0100237 classification_options:libtextclassifier2.ClassificationModelOptions;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100238 regex_model:libtextclassifier2.RegexModel;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100239 datetime_model:libtextclassifier2.DatetimeModel;
240
241 // Options controlling the output of the models.
242 triggering_options:libtextclassifier2.ModelTriggeringOptions;
Lukas Zilkadf710db2018-02-27 12:44:09 +0100243
Lukas Zilkaba849e72018-03-08 14:48:21 +0100244 // Global switch that controls if SuggestSelection(), ClassifyText() and
245 // Annotate() will run. If a mode is disabled it returns empty/no-op results.
246 enabled_modes:libtextclassifier2.ModeFlag = ALL;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100247}
248
Lukas Zilkab23e2122018-02-09 10:25:19 +0100249// Role of the codepoints in the range.
250namespace libtextclassifier2.TokenizationCodepointRange_;
251enum Role : int {
252 // Concatenates the codepoint to the current run of codepoints.
253 DEFAULT_ROLE = 0,
254
255 // Splits a run of codepoints before the current codepoint.
256 SPLIT_BEFORE = 1,
257
258 // Splits a run of codepoints after the current codepoint.
259 SPLIT_AFTER = 2,
260
261 // Each codepoint will be a separate token. Good e.g. for Chinese
262 // characters.
263 TOKEN_SEPARATOR = 3,
264
265 // Discards the codepoint.
266 DISCARD_CODEPOINT = 4,
267
268 // Common values:
269 // Splits on the characters and discards them. Good e.g. for the space
270 // character.
271 WHITESPACE_SEPARATOR = 7,
272}
273
274// Represents a codepoint range [start, end) with its role for tokenization.
275namespace libtextclassifier2;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100276table TokenizationCodepointRange {
277 start:int;
278 end:int;
279 role:libtextclassifier2.TokenizationCodepointRange_.Role;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100280
281 // Integer identifier of the script this range denotes. Negative values are
282 // reserved for Tokenizer's internal use.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100283 script_id:int;
284}
285
Lukas Zilkab23e2122018-02-09 10:25:19 +0100286// Method for selecting the center token.
287namespace libtextclassifier2.FeatureProcessorOptions_;
288enum CenterTokenSelectionMethod : int {
289 DEFAULT_CENTER_TOKEN_METHOD = 0,
290
291 // Use click indices to determine the center token.
292 CENTER_TOKEN_FROM_CLICK = 1,
293
294 // Use selection indices to get a token range, and select the middle of it
295 // as the center token.
296 CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100297}
298
Lukas Zilkab23e2122018-02-09 10:25:19 +0100299// Controls the type of tokenization the model will use for the input text.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100300namespace libtextclassifier2.FeatureProcessorOptions_;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100301enum TokenizationType : int {
302 INVALID_TOKENIZATION_TYPE = 0,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100303
Lukas Zilkab23e2122018-02-09 10:25:19 +0100304 // Use the internal tokenizer for tokenization.
305 INTERNAL_TOKENIZER = 1,
306
307 // Use ICU for tokenization.
308 ICU = 2,
309
310 // First apply ICU tokenization. Then identify stretches of tokens
311 // consisting only of codepoints in internal_tokenizer_codepoint_ranges
312 // and re-tokenize them using the internal tokenizer.
313 MIXED = 3,
314}
315
316// Range of codepoints start - end, where end is exclusive.
317namespace libtextclassifier2.FeatureProcessorOptions_;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100318table CodepointRange {
319 start:int;
320 end:int;
321}
322
Lukas Zilkadf710db2018-02-27 12:44:09 +0100323// Bounds-sensitive feature extraction configuration.
Lukas Zilkab23e2122018-02-09 10:25:19 +0100324namespace libtextclassifier2.FeatureProcessorOptions_;
325table BoundsSensitiveFeatures {
326 // Enables the extraction of bounds-sensitive features, instead of the click
327 // context features.
328 enabled:bool;
329
330 // The numbers of tokens to extract in specific locations relative to the
331 // bounds.
332 // Immediately before the span.
333 num_tokens_before:int;
334
335 // Inside the span, aligned with the beginning.
336 num_tokens_inside_left:int;
337
338 // Inside the span, aligned with the end.
339 num_tokens_inside_right:int;
340
341 // Immediately after the span.
342 num_tokens_after:int;
343
344 // If true, also extracts the tokens of the entire span and adds up their
345 // features forming one "token" to include in the extracted features.
346 include_inside_bag:bool;
347
348 // If true, includes the selection length (in the number of tokens) as a
349 // feature.
350 include_inside_length:bool;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100351
352 // If true, for selection, single token spans are not run through the model
353 // and their score is assumed to be zero.
354 score_single_token_spans_as_zero:bool;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100355}
356
357namespace libtextclassifier2.FeatureProcessorOptions_;
358table AlternativeCollectionMapEntry {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100359 key:string;
360 value:string;
361}
362
Lukas Zilkab23e2122018-02-09 10:25:19 +0100363namespace libtextclassifier2;
364table FeatureProcessorOptions {
365 // Number of buckets used for hashing charactergrams.
366 num_buckets:int = -1;
367
368 // Size of the embedding.
369 embedding_size:int = -1;
370
Lukas Zilkaba849e72018-03-08 14:48:21 +0100371 // Number of bits for quantization for embeddings.
372 embedding_quantization_bits:int = 8;
373
Lukas Zilkab23e2122018-02-09 10:25:19 +0100374 // Context size defines the number of words to the left and to the right of
375 // the selected word to be used as context. For example, if context size is
376 // N, then we take N words to the left and N words to the right of the
377 // selected word as its context.
378 context_size:int = -1;
379
380 // Maximum number of words of the context to select in total.
381 max_selection_span:int = -1;
382
383 // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
384 // character trigrams etc.
385 chargram_orders:[int];
386
387 // Maximum length of a word, in codepoints.
388 max_word_length:int = 20;
389
390 // If true, will use the unicode-aware functionality for extracting features.
391 unicode_aware_features:bool = 0;
392
393 // Whether to extract the token case feature.
394 extract_case_feature:bool = 0;
395
396 // Whether to extract the selection mask feature.
397 extract_selection_mask_feature:bool = 0;
398
399 // List of regexps to run over each token. For each regexp, if there is a
400 // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
401 regexp_feature:[string];
402
403 // Whether to remap all digits to a single number.
404 remap_digits:bool = 0;
405
406 // Whether to lower-case each token before generating hashgrams.
407 lowercase_tokens:bool;
408
409 // If true, the selection classifier output will contain only the selections
410 // that are feasible (e.g., those that are shorter than max_selection_span),
411 // if false, the output will be a complete cross-product of possible
412 // selections to the left and posible selections to the right, including the
413 // infeasible ones.
414 // NOTE: Exists mainly for compatibility with older models that were trained
415 // with the non-reduced output space.
416 selection_reduced_output_space:bool = 1;
417
418 // Collection names.
419 collections:[string];
420
421 // An index of collection in collections to be used if a collection name can't
422 // be mapped to an id.
423 default_collection:int = -1;
424
425 // If true, will split the input by lines, and only use the line that contains
426 // the clicked token.
427 only_use_line_with_click:bool = 0;
428
429 // If true, will split tokens that contain the selection boundary, at the
430 // position of the boundary.
431 // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
432 split_tokens_on_selection_boundaries:bool = 0;
433
434 // Codepoint ranges that determine how different codepoints are tokenized.
435 // The ranges must not overlap.
436 tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];
437
438 center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;
439
440 // If true, span boundaries will be snapped to containing tokens and not
441 // required to exactly match token boundaries.
442 snap_label_span_boundaries_to_containing_tokens:bool;
443
444 // A set of codepoint ranges supported by the model.
445 supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
446
447 // A set of codepoint ranges to use in the mixed tokenization mode to identify
448 // stretches of tokens to re-tokenize using the internal tokenizer.
449 internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];
450
451 // Minimum ratio of supported codepoints in the input context. If the ratio
452 // is lower than this, the feature computation will fail.
453 min_supported_codepoint_ratio:float = 0;
454
455 // Used for versioning the format of features the model expects.
456 // - feature_version == 0:
457 // For each token the features consist of:
458 // - chargram embeddings
459 // - dense features
460 // Chargram embeddings for tokens are concatenated first together,
461 // and at the end, the dense features for the tokens are concatenated
462 // to it. So the resulting feature vector has two regions.
463 feature_version:int = 0;
464
Lukas Zilkaba849e72018-03-08 14:48:21 +0100465 tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100466 icu_preserve_whitespace_tokens:bool = 0;
467
468 // List of codepoints that will be stripped from beginning and end of
469 // predicted spans.
470 ignored_span_boundary_codepoints:[int];
471
472 bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;
473
474 // List of allowed charactergrams. The extracted charactergrams are filtered
475 // using this list, and charactergrams that are not present are interpreted as
476 // out-of-vocabulary.
477 // If no allowed_chargrams are specified, all charactergrams are allowed.
478 // The field is typed as bytes type to allow non-UTF8 chargrams.
479 allowed_chargrams:[string];
480
481 // If true, tokens will be also split when the codepoint's script_id changes
482 // as defined in TokenizationCodepointRange.
483 tokenize_on_script_change:bool = 0;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100484}
485
Lukas Zilkab23e2122018-02-09 10:25:19 +0100486root_type libtextclassifier2.Model;