Import libtextclassifier changes from google3.
This includes an upgrade to the latest version of the model.
We now use a test data path constant (TEST_DATA_DIR) in the unit tests.
This is consistent with another test project, minikin_tests.
Test: Tests pass on-device.
Bug: 34865247
Change-Id: Ia061888a0bba371c6b429ad7c5457af611ba12f2
diff --git a/tests/feature-processor_test.cc b/tests/feature-processor_test.cc
index ecdad16..652db84 100644
--- a/tests/feature-processor_test.cc
+++ b/tests/feature-processor_test.cc
@@ -106,113 +106,95 @@
}
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
- SelectionWithContext selection;
- selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const CodepointSpan span = {0, 5};
+ // clang-format off
+ std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+ Token("Lině", 6, 10),
+ Token("Sěcond", 11, 17),
+ Token("Lině", 18, 22),
+ Token("Thiřd", 23, 28),
+ Token("Lině", 29, 33)};
+ // clang-format on
// Keeps the first line.
- selection.click_start = 0;
- selection.click_end = 5;
- selection.selection_start = 6;
- selection.selection_end = 10;
-
- SelectionWithContext line_selection;
- int shift;
- std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
- EXPECT_EQ(line_selection.context, "Fiřst Lině");
- EXPECT_EQ(line_selection.click_start, 0);
- EXPECT_EQ(line_selection.click_end, 5);
- EXPECT_EQ(line_selection.selection_start, 6);
- EXPECT_EQ(line_selection.selection_end, 10);
- EXPECT_EQ(shift, 0);
+ internal::StripTokensFromOtherLines(context, span, &tokens);
+ EXPECT_THAT(tokens,
+ ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
}
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
- SelectionWithContext selection;
- selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const CodepointSpan span = {18, 22};
+ // clang-format off
+ std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+ Token("Lině", 6, 10),
+ Token("Sěcond", 11, 17),
+ Token("Lině", 18, 22),
+ Token("Thiřd", 23, 28),
+ Token("Lině", 29, 33)};
+ // clang-format on
- // Keeps the second line.
- selection.click_start = 11;
- selection.click_end = 17;
- selection.selection_start = 18;
- selection.selection_end = 22;
-
- SelectionWithContext line_selection;
- int shift;
- std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
- EXPECT_EQ(line_selection.context, "Sěcond Lině");
- EXPECT_EQ(line_selection.click_start, 0);
- EXPECT_EQ(line_selection.click_end, 6);
- EXPECT_EQ(line_selection.selection_start, 7);
- EXPECT_EQ(line_selection.selection_end, 11);
- EXPECT_EQ(shift, 11);
+ // Keeps the second line.
+ internal::StripTokensFromOtherLines(context, span, &tokens);
+ EXPECT_THAT(tokens, ElementsAreArray(
+ {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
}
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
- SelectionWithContext selection;
- selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const CodepointSpan span = {24, 33};
+ // clang-format off
+ std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+ Token("Lině", 6, 10),
+ Token("Sěcond", 11, 17),
+ Token("Lině", 18, 22),
+ Token("Thiřd", 23, 28),
+ Token("Lině", 29, 33)};
+ // clang-format on
- // Keeps the third line.
- selection.click_start = 29;
- selection.click_end = 33;
- selection.selection_start = 23;
- selection.selection_end = 28;
-
- SelectionWithContext line_selection;
- int shift;
- std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
- EXPECT_EQ(line_selection.context, "Thiřd Lině");
- EXPECT_EQ(line_selection.click_start, 6);
- EXPECT_EQ(line_selection.click_end, 10);
- EXPECT_EQ(line_selection.selection_start, 0);
- EXPECT_EQ(line_selection.selection_end, 5);
- EXPECT_EQ(shift, 23);
+ // Keeps the third line.
+ internal::StripTokensFromOtherLines(context, span, &tokens);
+ EXPECT_THAT(tokens, ElementsAreArray(
+ {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
}
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
- SelectionWithContext selection;
- selection.context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
+ const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
+ const CodepointSpan span = {18, 22};
+ // clang-format off
+ std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+ Token("Lině", 6, 10),
+ Token("Sěcond", 11, 17),
+ Token("Lině", 18, 22),
+ Token("Thiřd", 23, 28),
+ Token("Lině", 29, 33)};
+ // clang-format on
- // Keeps the second line.
- selection.click_start = 11;
- selection.click_end = 17;
- selection.selection_start = 18;
- selection.selection_end = 22;
-
- SelectionWithContext line_selection;
- int shift;
- std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
- EXPECT_EQ(line_selection.context, "Sěcond Lině");
- EXPECT_EQ(line_selection.click_start, 0);
- EXPECT_EQ(line_selection.click_end, 6);
- EXPECT_EQ(line_selection.selection_start, 7);
- EXPECT_EQ(line_selection.selection_end, 11);
- EXPECT_EQ(shift, 11);
+ // Keeps the second line.
+ internal::StripTokensFromOtherLines(context, span, &tokens);
+ EXPECT_THAT(tokens, ElementsAreArray(
+ {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
}
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
- SelectionWithContext selection;
- selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+ const CodepointSpan span = {5, 23};
+ // clang-format off
+ std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+ Token("Lině", 6, 10),
+ Token("Sěcond", 18, 23),
+ Token("Lině", 19, 23),
+ Token("Thiřd", 23, 28),
+ Token("Lině", 29, 33)};
+ // clang-format on
- // Selects across lines, so KeepLine should not do any changes.
- selection.click_start = 6;
- selection.click_end = 17;
- selection.selection_start = 0;
- selection.selection_end = 22;
-
- SelectionWithContext line_selection;
- int shift;
- std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
- EXPECT_EQ(line_selection.context, "Fiřst Lině\nSěcond Lině\nThiřd Lině");
- EXPECT_EQ(line_selection.click_start, 6);
- EXPECT_EQ(line_selection.click_end, 17);
- EXPECT_EQ(line_selection.selection_start, 0);
- EXPECT_EQ(line_selection.selection_end, 22);
- EXPECT_EQ(shift, 0);
+ // Selects across lines, so no tokens should be stripped.
+ internal::StripTokensFromOtherLines(context, span, &tokens);
+ EXPECT_THAT(tokens, ElementsAreArray(
+ {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
+ Token("Sěcond", 18, 23), Token("Lině", 19, 23),
+ Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
}
TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
@@ -231,14 +213,6 @@
config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
FeatureProcessor feature_processor(options);
- SelectionWithContext selection_with_context;
- selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
- // Selection and click indices of the X in the middle:
- selection_with_context.selection_start = 20;
- selection_with_context.selection_end = 21;
- selection_with_context.click_start = 20;
- selection_with_context.click_end = 21;
-
// Test that two subsequent runs with random context dropout produce
// different features.
feature_processor.SetRandom(new std::mt19937);
@@ -251,13 +225,13 @@
CodepointSpan selection_codepoint_label;
int classification_label;
EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
- selection_with_context, &features, &extra_features,
- &selection_label_spans, &selection_label, &selection_codepoint_label,
- &classification_label));
+ "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+ &features, &extra_features, &selection_label_spans, &selection_label,
+ &selection_codepoint_label, &classification_label));
EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
- selection_with_context, &features2, &extra_features,
- &selection_label_spans, &selection_label, &selection_codepoint_label,
- &classification_label));
+ "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+ &features2, &extra_features, &selection_label_spans, &selection_label,
+ &selection_codepoint_label, &classification_label));
EXPECT_NE(features, features2);
}
@@ -276,14 +250,6 @@
config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
FeatureProcessor feature_processor(options);
- SelectionWithContext selection_with_context;
- selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
- // Selection and click indices of the X in the middle:
- selection_with_context.selection_start = 20;
- selection_with_context.selection_end = 21;
- selection_with_context.click_start = 20;
- selection_with_context.click_end = 21;
-
std::vector<std::vector<std::pair<int, float>>> features;
std::vector<float> extra_features;
std::vector<CodepointSpan> selection_label_spans;
@@ -291,19 +257,14 @@
CodepointSpan selection_codepoint_label;
int classification_label;
EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
- selection_with_context, &features, &extra_features,
- &selection_label_spans, &selection_label, &selection_codepoint_label,
- &classification_label));
+ "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+ &features, &extra_features, &selection_label_spans, &selection_label,
+ &selection_codepoint_label, &classification_label));
EXPECT_EQ(19, features.size());
// Should pad the string.
- selection_with_context.context = "X";
- selection_with_context.selection_start = 0;
- selection_with_context.selection_end = 1;
- selection_with_context.click_start = 0;
- selection_with_context.click_end = 1;
EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
- selection_with_context, &features, &extra_features,
+ "X", {0, 1}, {0, 1}, "", &features, &extra_features,
&selection_label_spans, &selection_label, &selection_codepoint_label,
&classification_label));
EXPECT_EQ(19, features.size());
@@ -432,5 +393,67 @@
}));
}
+TEST(FeatureProcessorTest, CenterTokenFromClick) {
+ int token_index;
+
+ // Exactly aligned indices.
+ token_index = internal::CenterTokenFromClick(
+ {6, 11}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+ Token("heře!", 12, 17, false)});
+ EXPECT_EQ(token_index, 1);
+
+ // Click is contained in a token.
+ token_index = internal::CenterTokenFromClick(
+ {13, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+ Token("heře!", 12, 17, false)});
+ EXPECT_EQ(token_index, 2);
+
+ // Click spans two tokens.
+ token_index = internal::CenterTokenFromClick(
+ {6, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+ Token("heře!", 12, 17, false)});
+ EXPECT_EQ(token_index, kInvalidIndex);
+}
+
+TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
+ SelectionWithContext selection;
+ int token_index;
+
+ // Selection of length 3. Exactly aligned indices.
+ token_index = internal::CenterTokenFromMiddleOfSelection(
+ {7, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+ Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+ Token("Token5", 28, 34, false)});
+ EXPECT_EQ(token_index, 2);
+
+ // Selection of length 1 token. Exactly aligned indices.
+ token_index = internal::CenterTokenFromMiddleOfSelection(
+ {21, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+ Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+ Token("Token5", 28, 34, false)});
+ EXPECT_EQ(token_index, 3);
+
+ // Selection marks sub-token range, with no tokens in it.
+ token_index = internal::CenterTokenFromMiddleOfSelection(
+ {29, 33}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+ Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+ Token("Token5", 28, 34, false)});
+ EXPECT_EQ(token_index, kInvalidIndex);
+
+ // Selection of length 2. Sub-token indices.
+ token_index = internal::CenterTokenFromMiddleOfSelection(
+ {3, 25}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+ Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+ Token("Token5", 28, 34, false)});
+ EXPECT_EQ(token_index, 1);
+
+ // Selection of length 1. Sub-token indices.
+ token_index = internal::CenterTokenFromMiddleOfSelection(
+ {22, 34}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+ Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+ Token("Token5", 28, 34, false)});
+ EXPECT_EQ(token_index, 4);
+}
+
} // namespace
} // namespace libtextclassifier
diff --git a/tests/lang-id_test.cc b/tests/lang-id_test.cc
index 39aed63..faf78c6 100644
--- a/tests/lang-id_test.cc
+++ b/tests/lang-id_test.cc
@@ -30,7 +30,7 @@
namespace {
std::string GetModelPath() {
- return "tests/testdata/langid.model";
+ return TEST_DATA_DIR "langid.model";
}
// Creates a LangId with default model. Passes ownership to
diff --git a/tests/testdata/smartselection.model b/tests/testdata/smartselection.model
index 8972a2e..2b96f42 100644
--- a/tests/testdata/smartselection.model
+++ b/tests/testdata/smartselection.model
Binary files differ
diff --git a/tests/text-classification-model_test.cc b/tests/text-classification-model_test.cc
index a4545cd..d588fc7 100644
--- a/tests/text-classification-model_test.cc
+++ b/tests/text-classification-model_test.cc
@@ -28,28 +28,30 @@
namespace {
std::string GetModelPath() {
- return "tests/testdata/smartselection.model";
+ return TEST_DATA_DIR "smartselection.model";
}
TEST(TextClassificationModelTest, SuggestSelection) {
const std::string model_path = GetModelPath();
int fd = open(model_path.c_str(), O_RDONLY);
- std::unique_ptr<TextClassificationModel> ff_model(
+ std::unique_ptr<TextClassificationModel> model(
new TextClassificationModel(fd));
close(fd);
- std::tuple<int, int> selection;
- selection = ff_model->SuggestSelection(
- "this afternoon Barack Obama gave a speech at", {15, 21});
- EXPECT_EQ(15, std::get<0>(selection));
- EXPECT_EQ(27, std::get<1>(selection));
+ EXPECT_EQ(model->SuggestSelection(
+ "this afternoon Barack Obama gave a speech at", {15, 21}),
+ std::make_pair(15, 27));
// Try passing whole string.
- selection =
- ff_model->SuggestSelection("350 Third Street, Cambridge", {0, 27});
// If more than 1 token is specified, we should return back what entered.
- EXPECT_EQ(0, std::get<0>(selection));
- EXPECT_EQ(27, std::get<1>(selection));
+ EXPECT_EQ(model->SuggestSelection("350 Third Street, Cambridge", {0, 27}),
+ std::make_pair(0, 27));
+
+ // Single letter.
+ EXPECT_EQ(std::make_pair(0, 1), model->SuggestSelection("a", {0, 1}));
+
+ // Single word.
+ EXPECT_EQ(std::make_pair(0, 4), model->SuggestSelection("asdf", {0, 4}));
}
TEST(TextClassificationModelTest, SuggestSelectionsAreSymmetric) {
@@ -183,6 +185,10 @@
namespace {
std::string FindBestResult(std::vector<std::pair<std::string, float>> results) {
+ if (results.empty()) {
+ return "<INVALID RESULTS>";
+ }
+
std::sort(results.begin(), results.end(),
[](const std::pair<std::string, float> a,
const std::pair<std::string, float> b) {
@@ -211,6 +217,29 @@
"Call me at (800) 123-456 today", {11, 24})));
EXPECT_EQ("url", FindBestResult(model->ClassifyText(
"Visit www.google.com every today!", {6, 20})));
+
+ // More lines.
+ EXPECT_EQ("other",
+ FindBestResult(model->ClassifyText(
+ "this afternoon Barack Obama gave a speech at|Visit "
+ "www.google.com every today!|Call me at (800) 123-456 today.",
+ {15, 27})));
+ EXPECT_EQ("url",
+ FindBestResult(model->ClassifyText(
+ "this afternoon Barack Obama gave a speech at|Visit "
+ "www.google.com every today!|Call me at (800) 123-456 today.",
+ {51, 65})));
+ EXPECT_EQ("phone",
+ FindBestResult(model->ClassifyText(
+ "this afternoon Barack Obama gave a speech at|Visit "
+ "www.google.com every today!|Call me at (800) 123-456 today.",
+ {90, 103})));
+
+ // Single word.
+ EXPECT_EQ("other", FindBestResult(model->ClassifyText("Obama", {0, 5})));
+ EXPECT_EQ("other", FindBestResult(model->ClassifyText("asdf", {0, 4})));
+ EXPECT_EQ("<INVALID RESULTS>",
+ FindBestResult(model->ClassifyText("asdf", {0, 0})));
}
} // namespace