Sync of lib2 to AOSP.

Model comes from experiment: 2524_BoundsEnglishv5_R1

Bug: 68239358
Test: Builds & tested on device.
Change-Id: I65cb7f0b067b68e3e1c22ee87232555887446089
diff --git a/token-feature-extractor_test.cc b/token-feature-extractor_test.cc
new file mode 100644
index 0000000..d6e48bb
--- /dev/null
+++ b/token-feature-extractor_test.cc
@@ -0,0 +1,564 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "token-feature-extractor.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace libtextclassifier2 {
+namespace {
+
+class TestingTokenFeatureExtractor : public TokenFeatureExtractor {
+ public:
+  using TokenFeatureExtractor::HashToken;
+  using TokenFeatureExtractor::TokenFeatureExtractor;
+};
+
+TEST(TokenFeatureExtractorTest, ExtractAscii) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2, 3};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = false;
+  options.extract_selection_mask_feature = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token{"Hello", 0, 5}, true, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  extractor.HashToken("H"),
+                  extractor.HashToken("e"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("o"),
+                  extractor.HashToken("^H"),
+                  extractor.HashToken("He"),
+                  extractor.HashToken("el"),
+                  extractor.HashToken("ll"),
+                  extractor.HashToken("lo"),
+                  extractor.HashToken("o$"),
+                  extractor.HashToken("^He"),
+                  extractor.HashToken("Hel"),
+                  extractor.HashToken("ell"),
+                  extractor.HashToken("llo"),
+                  extractor.HashToken("lo$")
+                  // clang-format on
+              }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  extractor.HashToken("w"),
+                  extractor.HashToken("o"),
+                  extractor.HashToken("r"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("d"),
+                  extractor.HashToken("!"),
+                  extractor.HashToken("^w"),
+                  extractor.HashToken("wo"),
+                  extractor.HashToken("or"),
+                  extractor.HashToken("rl"),
+                  extractor.HashToken("ld"),
+                  extractor.HashToken("d!"),
+                  extractor.HashToken("!$"),
+                  extractor.HashToken("^wo"),
+                  extractor.HashToken("wor"),
+                  extractor.HashToken("orl"),
+                  extractor.HashToken("rld"),
+                  extractor.HashToken("ld!"),
+                  extractor.HashToken("d!$"),
+                  // clang-format on
+              }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST(TokenFeatureExtractorTest, ExtractAsciiNoChargrams) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = false;
+  options.extract_selection_mask_feature = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token{"Hello", 0, 5}, true, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({extractor.HashToken("^Hello$")}));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({extractor.HashToken("^world!$")}));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST(TokenFeatureExtractorTest, ExtractUnicode) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2, 3};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = true;
+  options.extract_selection_mask_feature = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  extractor.HashToken("H"),
+                  extractor.HashToken("ě"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("ó"),
+                  extractor.HashToken("^H"),
+                  extractor.HashToken("Hě"),
+                  extractor.HashToken("ěl"),
+                  extractor.HashToken("ll"),
+                  extractor.HashToken("ló"),
+                  extractor.HashToken("ó$"),
+                  extractor.HashToken("^Hě"),
+                  extractor.HashToken("Hěl"),
+                  extractor.HashToken("ěll"),
+                  extractor.HashToken("lló"),
+                  extractor.HashToken("ló$")
+                  // clang-format on
+              }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  extractor.HashToken("w"),
+                  extractor.HashToken("o"),
+                  extractor.HashToken("r"),
+                  extractor.HashToken("l"),
+                  extractor.HashToken("d"),
+                  extractor.HashToken("!"),
+                  extractor.HashToken("^w"),
+                  extractor.HashToken("wo"),
+                  extractor.HashToken("or"),
+                  extractor.HashToken("rl"),
+                  extractor.HashToken("ld"),
+                  extractor.HashToken("d!"),
+                  extractor.HashToken("!$"),
+                  extractor.HashToken("^wo"),
+                  extractor.HashToken("wor"),
+                  extractor.HashToken("orl"),
+                  extractor.HashToken("rld"),
+                  extractor.HashToken("ld!"),
+                  extractor.HashToken("d!$"),
+                  // clang-format on
+              }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+}
+
+TEST(TokenFeatureExtractorTest, ExtractUnicodeNoChargrams) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = true;
+  options.extract_selection_mask_feature = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({extractor.HashToken("^Hělló$")}));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray({
+                                   extractor.HashToken("^world!$"),
+                               }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+}
+
+#ifdef LIBTEXTCLASSIFIER_TEST_ICU
+TEST(TokenFeatureExtractorTest, ICUCaseFeature) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = true;
+  options.extract_selection_mask_feature = false;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"Hělló", 0, 5}, true, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"Ř", 23, 29}, false, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"ř", 23, 29}, false, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0}));
+}
+#endif
+
+TEST(TokenFeatureExtractorTest, DigitRemapping) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.remap_digits = true;
+  options.unicode_aware_features = false;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"9:30am", 0, 6}, true, &sparse_features,
+                    &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"5:32am", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+  extractor.Extract(Token{"10:32am", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features,
+              testing::Not(testing::ElementsAreArray(sparse_features2)));
+}
+
+TEST(TokenFeatureExtractorTest, DigitRemappingUnicode) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.remap_digits = true;
+  options.unicode_aware_features = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"9:30am", 0, 6}, true, &sparse_features,
+                    &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"5:32am", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+  extractor.Extract(Token{"10:32am", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features,
+              testing::Not(testing::ElementsAreArray(sparse_features2)));
+}
+
+TEST(TokenFeatureExtractorTest, LowercaseAscii) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.lowercase_tokens = true;
+  options.unicode_aware_features = false;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"AABB", 0, 6}, true, &sparse_features,
+                    &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"aaBB", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+  extractor.Extract(Token{"aAbB", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
+#ifdef LIBTEXTCLASSIFIER_TEST_ICU
+TEST(TokenFeatureExtractorTest, LowercaseUnicode) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.lowercase_tokens = true;
+  options.unicode_aware_features = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"ŘŘ", 0, 6}, true, &sparse_features, &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"řř", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+#endif
+
+#ifdef LIBTEXTCLASSIFIER_TEST_ICU
+TEST(TokenFeatureExtractorTest, RegexFeatures) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.remap_digits = false;
+  options.unicode_aware_features = false;
+  options.regexp_features.push_back("^[a-z]+$");  // all lower case.
+  options.regexp_features.push_back("^[0-9]+$");  // all digits.
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"abCde", 0, 6}, true, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+
+  dense_features.clear();
+  extractor.Extract(Token{"abcde", 0, 6}, true, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, -1.0}));
+
+  dense_features.clear();
+  extractor.Extract(Token{"12c45", 0, 6}, true, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, -1.0}));
+
+  dense_features.clear();
+  extractor.Extract(Token{"12345", 0, 6}, true, &sparse_features,
+                    &dense_features);
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 1.0}));
+}
+#endif
+
+TEST(TokenFeatureExtractorTest, ExtractTooLongWord) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{22};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = true;
+  options.extract_selection_mask_feature = true;
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  // Test that this runs. ASAN should catch problems.
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"abcdefghijklmnopqřstuvwxyz", 0, 0}, true,
+                    &sparse_features, &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  extractor.HashToken("^abcdefghij\1qřstuvwxyz"),
+                  extractor.HashToken("abcdefghij\1qřstuvwxyz$"),
+                  // clang-format on
+              }));
+}
+
+TEST(TokenFeatureExtractorTest, ExtractAsciiUnicodeMatches) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2, 3, 4, 5};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = true;
+  options.extract_selection_mask_feature = true;
+
+  UniLib unilib;
+  TestingTokenFeatureExtractor extractor_unicode(options, unilib);
+
+  options.unicode_aware_features = false;
+  TestingTokenFeatureExtractor extractor_ascii(options, unilib);
+
+  for (const std::string& input :
+       {"https://www.abcdefgh.com/in/xxxkkkvayio",
+        "https://www.fjsidofj.om/xx/abadfy/xxxx/?xfjiis=ffffiijiihil",
+        "asdfhasdofjiasdofj#%()*%#*(aisdojfaosdifjiaofjdsiofjdi_fdis3w", "abcd",
+        "x", "Hello", "Hey,", "Hi", ""}) {
+    std::vector<int> sparse_features_unicode;
+    std::vector<float> dense_features_unicode;
+    extractor_unicode.Extract(Token{input, 0, 0}, true,
+                              &sparse_features_unicode,
+                              &dense_features_unicode);
+
+    std::vector<int> sparse_features_ascii;
+    std::vector<float> dense_features_ascii;
+    extractor_ascii.Extract(Token{input, 0, 0}, true, &sparse_features_ascii,
+                            &dense_features_ascii);
+
+    EXPECT_THAT(sparse_features_unicode, sparse_features_ascii) << input;
+    EXPECT_THAT(dense_features_unicode, dense_features_ascii) << input;
+  }
+}
+
+TEST(TokenFeatureExtractorTest, ExtractForPadToken) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = false;
+  options.extract_selection_mask_feature = true;
+
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token(), false, &sparse_features, &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({extractor.HashToken("<PAD>")}));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+}
+
+TEST(TokenFeatureExtractorTest, ExtractFiltered) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2, 3};
+  options.extract_case_feature = true;
+  options.unicode_aware_features = false;
+  options.extract_selection_mask_feature = true;
+  options.allowed_chargrams.insert("^H");
+  options.allowed_chargrams.insert("ll");
+  options.allowed_chargrams.insert("llo");
+  options.allowed_chargrams.insert("w");
+  options.allowed_chargrams.insert("!");
+  options.allowed_chargrams.insert("\xc4");  // UTF8 control character.
+
+  const UniLib unilib;
+  TestingTokenFeatureExtractor extractor(options, unilib);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+
+  extractor.Extract(Token{"Hěllo", 0, 5}, true, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features,
+              testing::ElementsAreArray({
+                  // clang-format off
+                  0,
+                  extractor.HashToken("\xc4"),
+                  0,
+                  0,
+                  0,
+                  0,
+                  extractor.HashToken("^H"),
+                  0,
+                  0,
+                  0,
+                  extractor.HashToken("ll"),
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  extractor.HashToken("llo"),
+                  0
+                  // clang-format on
+              }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({1.0, 1.0}));
+
+  sparse_features.clear();
+  dense_features.clear();
+  extractor.Extract(Token{"world!", 23, 29}, false, &sparse_features,
+                    &dense_features);
+
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray({
+                                   // clang-format off
+                  extractor.HashToken("w"),
+                  0,
+                  0,
+                  0,
+                  0,
+                  extractor.HashToken("!"),
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                  0,
+                                   // clang-format on
+                               }));
+  EXPECT_THAT(dense_features, testing::ElementsAreArray({-1.0, 0.0}));
+  EXPECT_EQ(extractor.HashToken("<PAD>"), 1);
+}
+
+}  // namespace
+}  // namespace libtextclassifier2