Import latest version of libtextclassifier.

This includes newly trained i18n models. The inference code now includes
the option to normalize all input to lowercase.

Bug: 36886059
Bug: 37534119
Test: Unit tests pass.
Change-Id: I28d1bd2241720f720d2dcabfb5710748a311b302

commit: deb722d4cfe3bff7eadaeaa032c6d4ae5ce80e0b [log] [tgz]
author: Matt Sharifi <mns@google.com> Mon Apr 24 13:30:47 2017 +0200
committer: Matt Sharifi <mns@google.com> Mon Apr 24 13:30:47 2017 +0200
tree: 8b9e0243ef7462fcc7085c4b2d9446f457b8003c
parent: 91f068673f5915dd4c12fac026e4a043e4a063a0 [diff]
diff --git a/tests/testdata/smartselection.model b/tests/testdata/smartselection.model
index 850033a..92dd58b 100644
--- a/tests/testdata/smartselection.model
+++ b/tests/testdata/smartselection.model
Binary files differ

diff --git a/tests/text-classification-model_test.cc b/tests/text-classification-model_test.cc
index cac093d..ed00876 100644
--- a/tests/text-classification-model_test.cc
+++ b/tests/text-classification-model_test.cc

@@ -267,8 +267,7 @@
                 {90, 103})));
 
   // Single word.
-  EXPECT_EQ("other",
-            FindBestResult(model->ClassifyText("Barack Obama", {0, 12})));
+  EXPECT_EQ("other", FindBestResult(model->ClassifyText("obama", {0, 5})));
   EXPECT_EQ("other", FindBestResult(model->ClassifyText("asdf", {0, 4})));
   EXPECT_EQ("<INVALID RESULTS>",
             FindBestResult(model->ClassifyText("asdf", {0, 0})));

diff --git a/tests/token-feature-extractor_test.cc b/tests/token-feature-extractor_test.cc
index 277549e..c85ba50 100644
--- a/tests/token-feature-extractor_test.cc
+++ b/tests/token-feature-extractor_test.cc

@@ -250,6 +250,47 @@
               testing::Not(testing::ElementsAreArray(sparse_features2)));
 }
 
+TEST(TokenFeatureExtractorTest, LowercaseAscii) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.lowercase_tokens = true;
+  options.unicode_aware_features = false;
+  TokenFeatureExtractor extractor(options);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"AABB", 0, 6}, true, &sparse_features,
+                    &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"aaBB", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+  extractor.Extract(Token{"aAbB", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
+TEST(TokenFeatureExtractorTest, LowercaseUnicode) {
+  TokenFeatureExtractorOptions options;
+  options.num_buckets = 1000;
+  options.chargram_orders = std::vector<int>{1, 2};
+  options.lowercase_tokens = true;
+  options.unicode_aware_features = true;
+  TokenFeatureExtractor extractor(options);
+
+  std::vector<int> sparse_features;
+  std::vector<float> dense_features;
+  extractor.Extract(Token{"ŘŘ", 0, 6}, true, &sparse_features, &dense_features);
+
+  std::vector<int> sparse_features2;
+  extractor.Extract(Token{"řř", 0, 6}, true, &sparse_features2,
+                    &dense_features);
+  EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
 TEST(TokenFeatureExtractorTest, RegexFeatures) {
   TokenFeatureExtractorOptions options;
   options.num_buckets = 1000;
commit	deb722d4cfe3bff7eadaeaa032c6d4ae5ce80e0b	[log] [tgz]
author	Matt Sharifi <mns@google.com>	Mon Apr 24 13:30:47 2017 +0200
committer	Matt Sharifi <mns@google.com>	Mon Apr 24 13:30:47 2017 +0200
tree	8b9e0243ef7462fcc7085c4b2d9446f457b8003c
parent	91f068673f5915dd4c12fac026e4a043e4a063a0 [diff]