Fixes utf8 handling, datetime model and flight number model, makes
models smaller by compressing the regex rules, and adds i18n models.

(sync from google3)

Test: bit FrameworksCoreTests:android.view.textclassifier.TextClassificationManagerTest
Test: bit CtsViewTestCases:android.view.textclassifier.cts.TextClassificationManagerTest

Bug: 64929062
Bug: 77223425

Change-Id: I04472e6b247e824bf2b745077c50fcde4269aefc
diff --git a/datetime/parser_test.cc b/datetime/parser_test.cc
index 1df959f..36525e2 100644
--- a/datetime/parser_test.cc
+++ b/datetime/parser_test.cc
@@ -25,6 +25,7 @@
 
 #include "datetime/parser.h"
 #include "model_generated.h"
+#include "text-classifier.h"
 #include "types-test-util.h"
 
 using testing::ElementsAreArray;
@@ -54,15 +55,27 @@
  public:
   void SetUp() override {
     model_buffer_ = ReadFile(GetModelPath() + "test_model.fb");
-    const Model* model = GetModel(model_buffer_.data());
-    ASSERT_TRUE(model != nullptr);
-    ASSERT_TRUE(model->datetime_model() != nullptr);
-    parser_ = DatetimeParser::Instance(model->datetime_model(), unilib_);
+    classifier_ = TextClassifier::FromUnownedBuffer(
+        model_buffer_.data(), model_buffer_.size(), &unilib_);
+    TC_CHECK(classifier_);
+    parser_ = classifier_->DatetimeParserForTests();
+  }
+
+  bool HasNoResult(const std::string& text, bool anchor_start_end = false,
+                   const std::string& timezone = "Europe/Zurich") {
+    std::vector<DatetimeParseResultSpan> results;
+    if (!parser_->Parse(text, 0, timezone, /*locales=*/"", ModeFlag_ANNOTATION,
+                        anchor_start_end, &results)) {
+      TC_LOG(ERROR) << text;
+      TC_CHECK(false);
+    }
+    return results.empty();
   }
 
   bool ParsesCorrectly(const std::string& marked_text,
                        const int64 expected_ms_utc,
                        DatetimeGranularity expected_granularity,
+                       bool anchor_start_end = false,
                        const std::string& timezone = "Europe/Zurich") {
     auto expected_start_index = marked_text.find("{");
     EXPECT_TRUE(expected_start_index != std::string::npos);
@@ -80,7 +93,7 @@
     std::vector<DatetimeParseResultSpan> results;
 
     if (!parser_->Parse(text, 0, timezone, /*locales=*/"", ModeFlag_ANNOTATION,
-                        &results)) {
+                        anchor_start_end, &results)) {
       TC_LOG(ERROR) << text;
       TC_CHECK(false);
     }
@@ -115,7 +128,8 @@
 
  protected:
   std::string model_buffer_;
-  std::unique_ptr<DatetimeParser> parser_;
+  std::unique_ptr<TextClassifier> classifier_;
+  const DatetimeParser* parser_;
   UniLib unilib_;
 };
 
@@ -129,7 +143,7 @@
 TEST_F(ParserTest, Parse) {
   EXPECT_TRUE(
       ParsesCorrectly("{January 1, 1988}", 567990000000, GRANULARITY_DAY));
-  EXPECT_TRUE(ParsesCorrectly("{1 2 2018}", 1517439600000, GRANULARITY_DAY));
+  EXPECT_TRUE(ParsesCorrectly("{1 2 2018}", 1514847600000, GRANULARITY_DAY));
   EXPECT_TRUE(
       ParsesCorrectly("{january 31 2018}", 1517353200000, GRANULARITY_DAY));
   EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000,
@@ -205,6 +219,7 @@
 
   EXPECT_TRUE(ParsesCorrectly("{today}", -3600000, GRANULARITY_DAY));
   EXPECT_TRUE(ParsesCorrectly("{today}", -57600000, GRANULARITY_DAY,
+                              /*anchor_start_end=*/false,
                               "America/Los_Angeles"));
   EXPECT_TRUE(ParsesCorrectly("{next week}", 255600000, GRANULARITY_WEEK));
   EXPECT_TRUE(ParsesCorrectly("{next day}", 82800000, GRANULARITY_DAY));
@@ -224,7 +239,103 @@
   EXPECT_TRUE(ParsesCorrectly("{three days ago}", -262800000, GRANULARITY_DAY));
 }
 
-// TODO(zilka): Add a test that tests multiple locales.
+TEST_F(ParserTest, ParseWithAnchor) {
+  EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000,
+                              GRANULARITY_DAY, /*anchor_start_end=*/false));
+  EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000,
+                              GRANULARITY_DAY, /*anchor_start_end=*/true));
+  EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000,
+                              GRANULARITY_DAY, /*anchor_start_end=*/false));
+  EXPECT_TRUE(HasNoResult("lorem 1 january 2018 ipsum",
+                          /*anchor_start_end=*/true));
+}
+
+class ParserLocaleTest : public testing::Test {
+ public:
+  void SetUp() override;
+  bool HasResult(const std::string& input, const std::string& locales);
+
+ protected:
+  UniLib unilib_;
+  flatbuffers::FlatBufferBuilder builder_;
+  std::unique_ptr<DatetimeParser> parser_;
+};
+
+void AddPattern(const std::string& regex, int locale,
+                std::vector<std::unique_ptr<DatetimeModelPatternT>>* patterns) {
+  patterns->emplace_back(new DatetimeModelPatternT);
+  patterns->back()->regexes.emplace_back(new DatetimeModelPattern_::RegexT);
+  patterns->back()->regexes.back()->pattern = regex;
+  patterns->back()->regexes.back()->groups.push_back(
+      DatetimeGroupType_GROUP_UNUSED);
+  patterns->back()->locales.push_back(locale);
+}
+
+void ParserLocaleTest::SetUp() {
+  DatetimeModelT model;
+  model.use_extractors_for_locating = false;
+  model.locales.clear();
+  model.locales.push_back("en-US");
+  model.locales.push_back("en-CH");
+  model.locales.push_back("zh-Hant");
+  model.locales.push_back("en-*");
+  model.locales.push_back("zh-Hant-*");
+  model.locales.push_back("*-CH");
+  model.locales.push_back("");
+
+  AddPattern(/*regex=*/"en-US", /*locale=*/0, &model.patterns);
+  AddPattern(/*regex=*/"en-CH", /*locale=*/1, &model.patterns);
+  AddPattern(/*regex=*/"zh-Hant", /*locale=*/2, &model.patterns);
+  AddPattern(/*regex=*/"en-all", /*locale=*/3, &model.patterns);
+  AddPattern(/*regex=*/"zh-Hant-all", /*locale=*/4, &model.patterns);
+  AddPattern(/*regex=*/"all-CH", /*locale=*/5, &model.patterns);
+  AddPattern(/*regex=*/"default", /*locale=*/6, &model.patterns);
+
+  builder_.Finish(DatetimeModel::Pack(builder_, &model));
+  const DatetimeModel* model_fb =
+      flatbuffers::GetRoot<DatetimeModel>(builder_.GetBufferPointer());
+  ASSERT_TRUE(model_fb);
+
+  parser_ = DatetimeParser::Instance(model_fb, unilib_,
+                                     /*decompressor=*/nullptr);
+  ASSERT_TRUE(parser_);
+}
+
+bool ParserLocaleTest::HasResult(const std::string& input,
+                                 const std::string& locales) {
+  std::vector<DatetimeParseResultSpan> results;
+  EXPECT_TRUE(parser_->Parse(input, /*reference_time_ms_utc=*/0,
+                             /*reference_timezone=*/"", locales,
+                             ModeFlag_ANNOTATION, false, &results));
+  return results.size() == 1;
+}
+
+TEST_F(ParserLocaleTest, English) {
+  EXPECT_TRUE(HasResult("en-US", /*locales=*/"en-US"));
+  EXPECT_FALSE(HasResult("en-CH", /*locales=*/"en-US"));
+  EXPECT_FALSE(HasResult("en-US", /*locales=*/"en-CH"));
+  EXPECT_TRUE(HasResult("en-CH", /*locales=*/"en-CH"));
+  EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH"));
+}
+
+TEST_F(ParserLocaleTest, TraditionalChinese) {
+  EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant"));
+  EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-TW"));
+  EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-SG"));
+  EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh-SG"));
+  EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh"));
+  EXPECT_TRUE(HasResult("default", /*locales=*/"zh"));
+  EXPECT_TRUE(HasResult("default", /*locales=*/"zh-Hant-SG"));
+}
+
+TEST_F(ParserLocaleTest, SwissEnglish) {
+  EXPECT_TRUE(HasResult("all-CH", /*locales=*/"de-CH"));
+  EXPECT_TRUE(HasResult("all-CH", /*locales=*/"en-CH"));
+  EXPECT_TRUE(HasResult("en-all", /*locales=*/"en-CH"));
+  EXPECT_FALSE(HasResult("all-CH", /*locales=*/"de-DE"));
+  EXPECT_TRUE(HasResult("default", /*locales=*/"de-CH"));
+  EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH"));
+}
 
 }  // namespace
 }  // namespace libtextclassifier2