//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
file_identifier "TC2 ";

namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;
}

// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection a match is assigned to.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group used as match boundaries.
  pattern:string;

  // Whether to apply the pattern for annotation.
  enabled_for_annotation:bool = 0;

  // Whether to apply the pattern for classification.
  enabled_for_classification:bool = 0;

  // Whether to apply the pattern for selection.
  enabled_for_selection:bool = 0;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}
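
// Example (illustrative only): a Pattern could be written in FlatBuffers JSON
// roughly as below; the collection name and regex are hypothetical and not
// taken from any shipped model:
//   {
//     "collection_name": "phone",
//     "pattern": "(\\+?[0-9][0-9 ()-]{6,})",
//     "enabled_for_classification": true,
//     "target_classification_score": 1.0,
//     "priority_score": 0.5
//   }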

namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}

namespace libtextclassifier2;
table DatetimeModelPattern {
  // List of regex patterns.
  regexes:[string];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}

namespace libtextclassifier2;
table DatetimeModelExtractor {
  extractor:libtextclassifier2.DatetimeExtractorType;
  pattern:string;
  locales:[int];
}

namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];
}
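
// Example (illustrative only): if DatetimeModel.locales were
// ["en-US", "en-GB", "de-DE"], a DatetimeModelPattern with locales: [0, 1]
// would apply to the two English locales only, while a pattern with an empty
// locales list would apply to all three.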

// Options controlling the output of the models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;
}

namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;
  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // TFLite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);
  regex_model:libtextclassifier2.RegexModel;

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // A name for the model that can be used, e.g., for logging.
  name:string;
}
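
// Example (illustrative only): a minimal Model could be written in FlatBuffers
// JSON along these lines (all values are hypothetical):
//   {
//     "locales": "en,de",
//     "version": 1,
//     "name": "example_model",
//     "triggering_options": { "min_annotate_confidence": 0.5 }
//   }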

// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the codepoint and discards it (SPLIT_BEFORE | SPLIT_AFTER |
  // DISCARD_CODEPOINT). Good e.g. for the space character.
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  start:int;
  end:int;
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}
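
// Example (illustrative only): to split on the ASCII space character and drop
// it from the output, a range could be { start: 32, end: 33,
// role: WHITESPACE_SEPARATOR }; since the end bound is exclusive, this covers
// only U+0020.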

// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}
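
// Example (illustrative only): with MIXED, ICU tokenizes the whole input
// first; stretches of tokens whose codepoints all fall inside
// internal_tokenizer_codepoint_ranges (e.g. ranges covering CJK characters)
// are then re-tokenized with the internal tokenizer.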

// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  start:int;
  end:int;
}
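
// Example (illustrative only): the ASCII digits '0'-'9' correspond to
// { start: 48, end: 58 }, because the end bound is exclusive.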

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;
}
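
// Example (illustrative only): with num_tokens_before = 2,
// num_tokens_inside_left = 2, num_tokens_inside_right = 2 and
// num_tokens_after = 2, the span "[quick brown fox]" in
// "the very quick brown fox jumped over it" would contribute the token
// windows ("the", "very") before, ("quick", "brown") and ("brown", "fox")
// inside, and ("jumped", "over") after the span.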

namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}

namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span);
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including
  // the infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // Index of a collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  // - feature_version == 0:
  //   For each token the features consist of:
  //    - chargram embeddings
  //    - dense features
  //   Chargram embeddings for the tokens are concatenated together first,
  //   and the dense features for the tokens are concatenated at the end.
  //   So the resulting feature vector has two regions.
  feature_version:int = 0;
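
  // Example (illustrative only) for feature_version 0: with two tokens t1 and
  // t2, the feature vector is laid out as
  //   [chargram_embedding(t1), chargram_embedding(t2),
  //    dense_features(t1), dense_features(t2)],
  // i.e. all chargram embeddings first, then all dense features.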

  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType;
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from the beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will also be split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;

  // Number of bits used for quantization of embeddings.
  embedding_quantization_bits:int = 8;
}

root_type libtextclassifier2.Model;
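
// Usage sketch (an assumption about tooling, not part of the schema): with the
// FlatBuffers compiler, C++ accessors for this schema can be generated with
// e.g. `flatc --cpp model.fbs`, and a JSON instance can be packed into a
// binary model with e.g. `flatc -b model.fbs model.json`; the file names here
// are hypothetical and exact flags may vary by flatc version.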