//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

file_identifier "TC2 ";

// The possible model modes, represented as a bit field.
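// For example, ANNOTATION (1) | SELECTION (4) == ANNOTATION_AND_SELECTION (5),
// and ALL (7) enables annotation, classification and selection together.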
namespace libtextclassifier2;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

namespace libtextclassifier2;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve only to inflate the selection, e.g. when we want to
  // select more text than is covered by the envelope of all the extractor
  // spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}

namespace libtextclassifier2;
table CompressedBuffer {
  buffer:[ubyte];
  uncompressed_size:int;
}

// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, unpaired brackets contained in
  // the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
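  // For example, a predicted selection "(foo bar" would be returned as
  // "foo bar", since the leading bracket has no matching closing one.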
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = 0;
}

// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;

  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}

// List of regular expression matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection assigned to a match.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group, which is then used as the match
  // boundaries.
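  // Illustrative (hypothetical) example: a pattern such as
  // "flight ([A-Z]{2}[0-9]{2,4})" would report only the span of the capturing
  // group as the match.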
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, an approximate matching implementation based on Find() will be
  // used instead of the true Match(). The approximate matching uses the first
  // Find() result and then checks that it spans the whole input.
  use_approximate_matching:bool = 0;

  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}

// List of regex patterns.
namespace libtextclassifier2.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
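  // Illustrative (hypothetical) example: for a pattern such as
  // "([0-9]{1,2}):([0-9]{2})", groups would be [GROUP_HOUR, GROUP_MINUTE].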
  groups:[libtextclassifier2.DatetimeGroupType];

  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table DatetimeModelPattern {
  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

namespace libtextclassifier2;
table DatetimeModelExtractor {
  extractor:libtextclassifier2.DatetimeExtractorType;
  pattern:string;
  locales:[int];
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}

namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
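  // For example (illustrative values): with locales = ["en-US", "de-DE"], a
  // DatetimeModelPattern whose locales field is [1] applies only to "de-DE".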
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = 1;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];
}

namespace libtextclassifier2.DatetimeModelLibrary_;
table Item {
  key:string;
  value:libtextclassifier2.DatetimeModel;
}

// A set of named DateTime models.
namespace libtextclassifier2;
table DatetimeModelLibrary {
  models:[libtextclassifier2.DatetimeModelLibrary_.Item];
}

// Options controlling the output of the TensorFlow Lite models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}

// Options controlling the output of the classifier.
namespace libtextclassifier2;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  //  - For annotation, the spans of the given collection are simply dropped.
  //  - For classification, the result is mapped to the class "other".
  //  - For selection, the spans of the given class are returned as
  //    single-selection.
  filtered_collections_annotation:[string];

  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}

namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
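  // For example (illustrative value): "en-US,en-GB,es".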
  locales:string;

  version:int;

  // A name for the model that can be used e.g. for logging.
  name:string;

  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // TensorFlow Lite models.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  regex_model:libtextclassifier2.RegexModel;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // Global switch that controls whether SuggestSelection(), ClassifyText() and
  // Annotate() run. If a mode is disabled, it returns empty/no-op results.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // If true, selections that consist only of whitespace will be snapped to the
  // containing suggested span. Otherwise, no suggestion is proposed, since such
  // selections are not part of any token.
  snap_whitespace_selections:bool = 1;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:libtextclassifier2.OutputOptions;
}

// Role of the codepoints in the range.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
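  // (Presumably 7 == SPLIT_BEFORE | SPLIT_AFTER | DISCARD_CODEPOINT, treating
  // the roles above as bit flags.)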
  WHITESPACE_SEPARATOR = 7,
}

// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  start:int;
  end:int;
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}

// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}

// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}

// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  start:int;
  end:int;
}

// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;
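  // For example (illustrative values): with num_tokens_before = 2,
  // num_tokens_inside_left = 2, num_tokens_inside_right = 2 and
  // num_tokens_after = 2, features are extracted for the two tokens preceding
  // the span, the first two and last two tokens inside it, and the two tokens
  // following it.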

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}

namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}

namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits used for quantization of the embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span);
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // Index of the collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
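  // For example (illustrative entry): a range with start = 32, end = 33 (the
  // space character) and role = WHITESPACE_SEPARATOR splits tokens on spaces
  // and drops the spaces themselves.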
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
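  // For example (illustrative value): 0.5 requires at least half of the
  // context codepoints to fall inside supported_codepoint_ranges.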
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      The chargram embeddings for the tokens are concatenated together
  //      first, and the dense features for the tokens are appended at the end,
  //      so the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will also be split when the codepoint's script_id changes,
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;
}

root_type libtextclassifier2.Model;