Import latest version of libtextclassifier.
This includes newly trained i18n models.
The inference code now includes an option (lowercase_tokens in
TokenFeatureExtractorOptions) to normalize all input to lowercase.
Bug: 36886059
Bug: 37534119
Test: Unit tests pass.
Change-Id: I28d1bd2241720f720d2dcabfb5710748a311b302
diff --git a/tests/testdata/smartselection.model b/tests/testdata/smartselection.model
index 850033a..92dd58b 100644
--- a/tests/testdata/smartselection.model
+++ b/tests/testdata/smartselection.model
Binary files differ
diff --git a/tests/text-classification-model_test.cc b/tests/text-classification-model_test.cc
index cac093d..ed00876 100644
--- a/tests/text-classification-model_test.cc
+++ b/tests/text-classification-model_test.cc
@@ -267,8 +267,7 @@
{90, 103})));
// Single word.
- EXPECT_EQ("other",
- FindBestResult(model->ClassifyText("Barack Obama", {0, 12})));
+ EXPECT_EQ("other", FindBestResult(model->ClassifyText("obama", {0, 5})));
EXPECT_EQ("other", FindBestResult(model->ClassifyText("asdf", {0, 4})));
EXPECT_EQ("<INVALID RESULTS>",
FindBestResult(model->ClassifyText("asdf", {0, 0})));
diff --git a/tests/token-feature-extractor_test.cc b/tests/token-feature-extractor_test.cc
index 277549e..c85ba50 100644
--- a/tests/token-feature-extractor_test.cc
+++ b/tests/token-feature-extractor_test.cc
@@ -250,6 +250,47 @@
testing::Not(testing::ElementsAreArray(sparse_features2)));
}
+TEST(TokenFeatureExtractorTest, LowercaseAscii) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.lowercase_tokens = true;
+ options.unicode_aware_features = false;
+ TokenFeatureExtractor extractor(options);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"AABB", 0, 6}, true, &sparse_features,
+ &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"aaBB", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+
+ extractor.Extract(Token{"aAbB", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
+TEST(TokenFeatureExtractorTest, LowercaseUnicode) {
+ TokenFeatureExtractorOptions options;
+ options.num_buckets = 1000;
+ options.chargram_orders = std::vector<int>{1, 2};
+ options.lowercase_tokens = true;
+ options.unicode_aware_features = true;
+ TokenFeatureExtractor extractor(options);
+
+ std::vector<int> sparse_features;
+ std::vector<float> dense_features;
+ extractor.Extract(Token{"ŘŘ", 0, 6}, true, &sparse_features, &dense_features);
+
+ std::vector<int> sparse_features2;
+ extractor.Extract(Token{"řř", 0, 6}, true, &sparse_features2,
+ &dense_features);
+ EXPECT_THAT(sparse_features, testing::ElementsAreArray(sparse_features2));
+}
+
TEST(TokenFeatureExtractorTest, RegexFeatures) {
TokenFeatureExtractorOptions options;
options.num_buckets = 1000;