//
// Copyright (C) 2017 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
| 16 | |
// File identifier stored in buffers serialized with this schema. FlatBuffers
// requires exactly 4 characters, so the trailing space is significant.
file_identifier "TC2 ";
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 18 | |
// The possible model modes. The values form a bit field: ANNOTATION (1),
// CLASSIFICATION (2) and SELECTION (4) are the individual bits, and the
// remaining entries name the combinations of those bits.
namespace libtextclassifier2;
enum ModeFlag : int {
  // No mode enabled.
  NONE = 0,

  // Single-bit modes and their pairwise/total combinations.
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
| 31 | |
// Type tag of a datetime extractor; DatetimeModelExtractor.extractor holds one
// of these values. Every value is numbered explicitly, so new entries should
// be appended with fresh numbers instead of renumbering existing ones.
// NOTE(review): the group headings below are inferred from the value names.
namespace libtextclassifier2;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,

  // AM/PM markers.
  AM = 1,
  PM = 2,

  // Month names.
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,

  // Relative references.
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,

  // Singular units.
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,

  // Weekday names.
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,

  // Plural units.
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,

  // Digit sequences.
  DIGITS = 41,
  SIGNEDDIGITS = 42,

  // Number words.
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}
| 108 | |
// Type of a regex capturing group in a datetime pattern. The `groups` vector
// of DatetimeModelPattern_.Regex holds one of these per capturing group.
namespace libtextclassifier2;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might
  // want to select more text than was contained in an envelope of all
  // extractor spans.
  GROUP_DUMMY1 = 12,

  GROUP_DUMMY2 = 13,
}
| 131 | |
// A byte buffer together with an integer recording its uncompressed size.
// Used by the `compressed_pattern` fields of the regex and datetime tables.
// NOTE(review): presumably `buffer` holds the compressed bytes; the
// compression scheme is not specified in this schema -- confirm with the
// reader implementation.
namespace libtextclassifier2;
table CompressedBuffer {
  buffer:[ubyte];
  uncompressed_size:int;
}
| 137 | |
// Options for the model that predicts text selection.
namespace libtextclassifier2;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both ends of the selection.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = 1;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = 0;
}
| 157 | |
// Options for the model that classifies a text selection.
namespace libtextclassifier2;
table ClassificationModelOptions {
  // Limits for phone numbers: minimum number of digits.
  phone_min_num_digits:int = 7;

  // Limits for phone numbers: maximum number of digits.
  phone_max_num_digits:int = 15;

  // Limits for addresses: minimum number of tokens.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
| 172 | |
// A single regular expression matcher. RegexModel.patterns holds the list of
// these matchers to check.
namespace libtextclassifier2.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  // Can specify a single capturing group used as match boundaries.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = 0;

  // NOTE(review): presumably a compressed alternative to `pattern`; which of
  // the two takes precedence is not stated in this schema -- confirm.
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}
| 199 | |
// Regex-based model: the list of regular expression matchers to check.
namespace libtextclassifier2;
table RegexModel {
  patterns:[libtextclassifier2.RegexModel_.Pattern];
}
| 204 | |
// A single regex of a datetime pattern. DatetimeModelPattern.regexes holds the
// list of these.
namespace libtextclassifier2.DatetimeModelPattern_;
table Regex {
  // The regular expression text.
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[libtextclassifier2.DatetimeGroupType];

  // NOTE(review): presumably a compressed alternative to `pattern`; which of
  // the two takes precedence is not stated in this schema -- confirm.
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}
| 216 | |
// A group of datetime regexes sharing locales, scores and enabled modes.
namespace libtextclassifier2;
table DatetimeModelPattern {
  // List of regex patterns.
  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}
| 234 | |
// Associates a DatetimeExtractorType with a pattern and a list of locales.
namespace libtextclassifier2;
table DatetimeModelExtractor {
  // The extractor type this entry defines.
  extractor:libtextclassifier2.DatetimeExtractorType;

  // The pattern text for this extractor.
  pattern:string;

  // NOTE(review): presumably indices into DatetimeModel.locales, as for
  // DatetimeModelPattern.locales -- confirm.
  locales:[int];

  // NOTE(review): presumably a compressed alternative to `pattern` -- confirm.
  compressed_pattern:libtextclassifier2.CompressedBuffer;
}
| 242 | |
// Datetime model: supported locales plus the patterns and extractors that
// refer to those locales by index.
namespace libtextclassifier2;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[libtextclassifier2.DatetimeModelPattern];
  extractors:[libtextclassifier2.DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = 1;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];
}
| 260 | |
// One named entry of a DatetimeModelLibrary: a string key and the
// DatetimeModel stored under it.
namespace libtextclassifier2.DatetimeModelLibrary_;
table Item {
  key:string;
  value:libtextclassifier2.DatetimeModel;
}
| 266 | |
// A set of named DateTime models, stored as a vector of key/value items.
namespace libtextclassifier2;
table DatetimeModelLibrary {
  models:[libtextclassifier2.DatetimeModelLibrary_.Item];
}
| 272 | |
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier2;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;
}
| 282 | |
// Options controlling the output of the classifier. Each field is a list of
// collection names that will be filtered out at the output of one mode.
namespace libtextclassifier2;
table OutputOptions {
  // For annotation, the spans of given collection are simply dropped.
  filtered_collections_annotation:[string];

  // For classification, the result is mapped to the class "other".
  filtered_collections_classification:[string];

  // For selection, the spans of given class are returned as
  // single-selection.
  filtered_collections_selection:[string];
}
| 296 | |
// Root table of the schema: bundles the Tensorflow Lite models, their feature
// processing options, the regex and datetime models and the global switches.
// Fields carry no explicit `id` attribute, so their ids follow declaration
// order; new fields must only be appended at the end.
namespace libtextclassifier2;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  // Feature processing options for the selection and classification models.
  selection_feature_options:libtextclassifier2.FeatureProcessorOptions;
  classification_feature_options:libtextclassifier2.FeatureProcessorOptions;

  // Tensorflow Lite models. force_align requests 16-byte alignment of the
  // embedded byte vectors inside the serialized buffer.
  selection_model:[ubyte] (force_align: 16);

  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:libtextclassifier2.SelectionModelOptions;

  classification_options:libtextclassifier2.ClassificationModelOptions;
  regex_model:libtextclassifier2.RegexModel;
  datetime_model:libtextclassifier2.DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:libtextclassifier2.ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:libtextclassifier2.ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = 1;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:libtextclassifier2.OutputOptions;
}
| 339 | |
// Role of the codepoints in the range.
// Numerically, TOKEN_SEPARATOR (3) equals SPLIT_BEFORE | SPLIT_AFTER and
// WHITESPACE_SEPARATOR (7) equals SPLIT_BEFORE | SPLIT_AFTER |
// DISCARD_CODEPOINT, which matches their descriptions below.
namespace libtextclassifier2.TokenizationCodepointRange_;
enum Role : int {
  // Concatenates the codepoint to the current run of codepoints.
  DEFAULT_ROLE = 0,

  // Splits a run of codepoints before the current codepoint.
  SPLIT_BEFORE = 1,

  // Splits a run of codepoints after the current codepoint.
  SPLIT_AFTER = 2,

  // Each codepoint will be a separate token. Good e.g. for Chinese
  // characters.
  TOKEN_SEPARATOR = 3,

  // Discards the codepoint.
  DISCARD_CODEPOINT = 4,

  // Common values:
  // Splits on the characters and discards them. Good e.g. for the space
  // character.
  WHITESPACE_SEPARATOR = 7,
}
| 364 | |
// Represents a codepoint range [start, end) with its role for tokenization.
namespace libtextclassifier2;
table TokenizationCodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // End of the range (exclusive).
  end:int;

  // How codepoints in this range take part in tokenization.
  role:libtextclassifier2.TokenizationCodepointRange_.Role;

  // Integer identifier of the script this range denotes. Negative values are
  // reserved for Tokenizer's internal use.
  script_id:int;
}
| 376 | |
// Method for selecting the center token.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
| 389 | |
// Controls the type of tokenization the model will use for the input text.
namespace libtextclassifier2.FeatureProcessorOptions_;
enum TokenizationType : int {
  INVALID_TOKENIZATION_TYPE = 0,

  // Use the internal tokenizer for tokenization.
  INTERNAL_TOKENIZER = 1,

  // Use ICU for tokenization.
  ICU = 2,

  // First apply ICU tokenization. Then identify stretches of tokens
  // consisting only of codepoints in internal_tokenizer_codepoint_ranges
  // and re-tokenize them using the internal tokenizer.
  MIXED = 3,
}
| 406 | |
// Range of codepoints start - end, where end is exclusive.
namespace libtextclassifier2.FeatureProcessorOptions_;
table CodepointRange {
  // First codepoint of the range (inclusive).
  start:int;

  // End of the range (exclusive).
  end:int;
}
| 413 | |
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier2.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;

  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;

  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;

  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
| 447 | |
// A string key/value pair.
// NOTE(review): no other definition visible in this schema references this
// table; presumably it maps one collection name to an alternative one --
// confirm before relying on it.
namespace libtextclassifier2.FeatureProcessorOptions_;
table AlternativeCollectionMapEntry {
  key:string;
  value:string;
}
| 453 | |
// Configuration of feature extraction and tokenization. Model uses this table
// twice: as selection_feature_options and classification_feature_options.
// Fields carry no explicit `id` attribute, so their ids follow declaration
// order; new fields must only be appended at the end.
namespace libtextclassifier2;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = 0;

  // Whether to extract the token case feature.
  extract_case_feature:bool = 0;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = 0;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = 0;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = 1;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name can't
  // be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that contains
  // the clicked token.
  only_use_line_with_click:bool = 0;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = 0;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[libtextclassifier2.TokenizationCodepointRange];

  // Method for selecting the center token.
  center_token_selection_method:libtextclassifier2.FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to identify
  // stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[libtextclassifier2.FeatureProcessorOptions_.CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  // Controls the type of tokenization the model will use for the input text.
  tokenization_type:libtextclassifier2.FeatureProcessorOptions_.TokenizationType = INTERNAL_TOKENIZER;

  // NOTE(review): presumably keeps whitespace tokens produced by ICU
  // tokenization instead of dropping them -- confirm.
  icu_preserve_whitespace_tokens:bool = 0;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  // Bounds-sensitive feature extraction configuration.
  bounds_sensitive_features:libtextclassifier2.FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted as
  // out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // NOTE(review): an earlier comment here said the field is "typed as bytes"
  // to allow non-UTF8 chargrams, but it is declared as [string], which
  // FlatBuffers expects to hold UTF-8 text -- confirm how non-UTF8 chargrams
  // are handled.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = 0;
}
| 576 | |
// Declares Model as the root table of buffers serialized with this schema.
root_type libtextclassifier2.Model;