Fixes utf8 handling, datetime model and flight number model, makes models smaller by compressing the regex rules, and adds i18n models.

(sync from google3)

Test: bit FrameworksCoreTests:android.view.textclassifier.TextClassificationManagerTest
Test: bit CtsViewTestCases:android.view.textclassifier.cts.TextClassificationManagerTest
Bug: 64929062
Bug: 77223425
Change-Id: I04472e6b247e824bf2b745077c50fcde4269aefc

commit: e7962cca83035d93ca32912c47f46a1c5a4ef016 [log] [tgz]
author: Lukas Zilka <zilka@google.com> Wed Mar 28 18:09:48 2018 +0200
committer: Lukas Zilka <zilka@google.com> Thu Mar 29 21:50:00 2018 +0200
tree: 85e0cd399ce772a9792b27b340c1956bc693de86
parent: ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40b [diff] [blame]
diff --git a/model.fbs b/model.fbs
index 590c815..23cf229 100755
--- a/model.fbs
+++ b/model.fbs

@@ -106,6 +106,35 @@
   THOUSAND = 72,
 }
 
+namespace libtextclassifier2;
+enum DatetimeGroupType : int {
+  GROUP_UNKNOWN = 0,
+  GROUP_UNUSED = 1,
+  GROUP_YEAR = 2,
+  GROUP_MONTH = 3,
+  GROUP_DAY = 4,
+  GROUP_HOUR = 5,
+  GROUP_MINUTE = 6,
+  GROUP_SECOND = 7,
+  GROUP_AMPM = 8,
+  GROUP_RELATIONDISTANCE = 9,
+  GROUP_RELATION = 10,
+  GROUP_RELATIONTYPE = 11,
+
+  // Dummy groups serve just as an inflator of the selection. E.g. we might want
+  // to select more text than was contained in an envelope of all extractor
+  // spans.
+  GROUP_DUMMY1 = 12,
+
+  GROUP_DUMMY2 = 13,
+}
+
+namespace libtextclassifier2;
+table CompressedBuffer {
+  buffer:[ubyte];
+  uncompressed_size:int;
+}
+
 // Options for the model that predicts text selection.
 namespace libtextclassifier2;
 table SelectionModelOptions {
@@ -121,6 +150,9 @@
 
   // Number of examples to bundle in one batch for inference.
   batch_size:int = 1024;
+
+  // Whether to always classify a suggested selection or only on demand.
+  always_classify_suggested_selection:bool = 0;
 }
 
 // Options for the model that classifies a text selection.
@@ -130,6 +162,9 @@
   phone_min_num_digits:int = 7;
 
   phone_max_num_digits:int = 15;
+
+  // Limits for addresses.
+  address_min_num_tokens:int;
 }
 
 // List of regular expression matchers to check.
@@ -155,6 +190,8 @@
   // using Find() instead of the true Match(). This approximate matching will
   // use the first Find() result and then check that it spans the whole input.
   use_approximate_matching:bool = 0;
+
+  compressed_pattern:libtextclassifier2.CompressedBuffer;
 }
 
 namespace libtextclassifier2;
@@ -162,10 +199,21 @@
   patterns:[libtextclassifier2.RegexModel_.Pattern];
 }
 
+// List of regex patterns.
+namespace libtextclassifier2.DatetimeModelPattern_;
+table Regex {
+  pattern:string;
+
+  // The ith entry specifies the type of the ith capturing group.
+  // This is used to decide how the matched content has to be parsed.
+  groups:[libtextclassifier2.DatetimeGroupType];
+
+  compressed_pattern:libtextclassifier2.CompressedBuffer;
+}
+
 namespace libtextclassifier2;
 table DatetimeModelPattern {
-  // List of regex patterns.
-  regexes:[string];
+  regexes:[libtextclassifier2.DatetimeModelPattern_.Regex];
 
   // List of locale indices in DatetimeModel that represent the locales that
   // these patterns should be used for. If empty, can be used for all locales.
@@ -186,6 +234,7 @@
   extractor:libtextclassifier2.DatetimeExtractorType;
   pattern:string;
   locales:[int];
+  compressed_pattern:libtextclassifier2.CompressedBuffer;
 }
 
 namespace libtextclassifier2;
@@ -212,6 +261,20 @@
   enabled_modes:libtextclassifier2.ModeFlag = ALL;
 }
 
+// Options controlling the output of the classifier.
+namespace libtextclassifier2;
+table OutputOptions {
+  // Lists of collection names that will be filtered out at the output:
+  // - For annotation, the spans of given collection are simply dropped.
+  // - For classification, the result is mapped to the class "other".
+  // - For selection, the spans of given class are returned as
+  // single-selection.
+  filtered_collections_annotation:[string];
+
+  filtered_collections_classification:[string];
+  filtered_collections_selection:[string];
+}
+
 namespace libtextclassifier2;
 table Model {
   // Comma-separated list of locales supported by the model as BCP 47 tags.
@@ -244,6 +307,15 @@
   // Global switch that controls if SuggestSelection(), ClassifyText() and
   // Annotate() will run. If a mode is disabled it returns empty/no-op results.
   enabled_modes:libtextclassifier2.ModeFlag = ALL;
+
+  // If true, will snap the selections that consist only of whitespaces to the
+  // containing suggested span. Otherwise, no suggestion is proposed, since the
+  // selections are not part of any token.
+  snap_whitespace_selections:bool = 1;
+
+  // Global configuration for the output of SuggestSelection(), ClassifyText()
+  // and Annotate().
+  output_options:libtextclassifier2.OutputOptions;
 }
 
 // Role of the codepoints in the range.
@@ -409,7 +481,7 @@
   // If true, the selection classifier output will contain only the selections
   // that are feasible (e.g., those that are shorter than max_selection_span),
   // if false, the output will be a complete cross-product of possible
-  // selections to the left and posible selections to the right, including the
+  // selections to the left and possible selections to the right, including the
   // infeasible ones.
   // NOTE: Exists mainly for compatibility with older models that were trained
   // with the non-reduced output space.
commit	e7962cca83035d93ca32912c47f46a1c5a4ef016	[log] [tgz]
author	Lukas Zilka <zilka@google.com>	Wed Mar 28 18:09:48 2018 +0200
committer	Lukas Zilka <zilka@google.com>	Thu Mar 29 21:50:00 2018 +0200
tree	85e0cd399ce772a9792b27b340c1956bc693de86
parent	ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40b [diff] [blame]