// blob: ecdad160e00a3481230951429ab9ca7ca3b70125 [file] [log] [blame]
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smartselect/feature-processor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier {
namespace {
using testing::ElementsAreArray;
// A selection that lies strictly inside the second token ([9, 12) within
// [6, 23)) must cut that token at both selection boundaries, yielding three
// pieces, while the neighbouring tokens stay as they were.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  const std::vector<Token> expected_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěě", 6, 9, false),
      Token("bař", 9, 12, false),
      Token("@google.com", 12, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };

  std::vector<Token> actual_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař@google.com", 6, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };
  internal::SplitTokensOnSelectionBoundaries({9, 12}, &actual_tokens);

  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
}
// A selection that starts exactly where the second token starts (6) and ends
// inside it (12) must produce a single cut, at the selection end.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  const std::vector<Token> expected_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař", 6, 12, false),
      Token("@google.com", 12, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };

  std::vector<Token> actual_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař@google.com", 6, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };
  internal::SplitTokensOnSelectionBoundaries({6, 12}, &actual_tokens);

  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
}
// A selection that starts inside the second token (9) and ends exactly where
// that token ends (23) must produce a single cut, at the selection start.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  const std::vector<Token> expected_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěě", 6, 9, false),
      Token("bař@google.com", 9, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };

  std::vector<Token> actual_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař@google.com", 6, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };
  internal::SplitTokensOnSelectionBoundaries({9, 23}, &actual_tokens);

  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
}
// A selection that coincides with the second token's span ([6, 23)) must not
// split anything: the token sequence is expected to come back unchanged.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  const std::vector<Token> original_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař@google.com", 6, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };

  // Work on a copy so the untouched original doubles as the expectation.
  std::vector<Token> actual_tokens = original_tokens;
  internal::SplitTokensOnSelectionBoundaries({6, 23}, &actual_tokens);

  EXPECT_THAT(actual_tokens, ElementsAreArray(original_tokens));
}
// A selection that begins inside the first token (2) and ends inside the
// second one (9) must split each of those tokens once, at the boundary that
// falls within it.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  const std::vector<Token> expected_tokens = {
      // clang-format off
      Token("Hě", 0, 2, false),
      Token("lló", 2, 5, false),
      Token("fěě", 6, 9, false),
      Token("bař@google.com", 9, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };

  std::vector<Token> actual_tokens = {
      // clang-format off
      Token("Hělló", 0, 5, false),
      Token("fěěbař@google.com", 6, 23, false),
      Token("heře!", 24, 29, false),
      // clang-format on
  };
  internal::SplitTokensOnSelectionBoundaries({2, 9}, &actual_tokens);

  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
}
// Click and selection both lie on the first of three lines: the extracted
// context must be that line alone, and since it starts at offset 0 neither the
// indices nor the reported shift change.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  SelectionWithContext input;
  input.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // Keeps the first line: click on [0, 5), selection on [6, 10).
  input.selection_start = 6;
  input.selection_end = 10;
  input.click_start = 0;
  input.click_end = 5;

  const auto extraction = internal::ExtractLineWithClick(input);
  const SelectionWithContext& line = std::get<0>(extraction);
  const int shift = std::get<1>(extraction);

  EXPECT_EQ(line.context, "Fiřst Lině");
  EXPECT_EQ(line.click_start, 0);
  EXPECT_EQ(line.click_end, 5);
  EXPECT_EQ(line.selection_start, 6);
  EXPECT_EQ(line.selection_end, 10);
  EXPECT_EQ(shift, 0);
}
// Click and selection both lie on the second of three lines: the extracted
// context must be that line alone, with all indices rebased by the line's
// start offset (11), which is also the reported shift.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  SelectionWithContext input;
  input.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // Keeps the second line: click on [11, 17), selection on [18, 22).
  input.selection_start = 18;
  input.selection_end = 22;
  input.click_start = 11;
  input.click_end = 17;

  const auto extraction = internal::ExtractLineWithClick(input);
  const SelectionWithContext& line = std::get<0>(extraction);
  const int shift = std::get<1>(extraction);

  EXPECT_EQ(line.context, "Sěcond Lině");
  EXPECT_EQ(line.click_start, 0);
  EXPECT_EQ(line.click_end, 6);
  EXPECT_EQ(line.selection_start, 7);
  EXPECT_EQ(line.selection_end, 11);
  EXPECT_EQ(shift, 11);
}
// Click and selection both lie on the last of three lines, with the selection
// preceding the click: the extracted context must be that line alone, with all
// indices rebased by the line's start offset (23).
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  SelectionWithContext input;
  input.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // Keeps the third line: click on [29, 33), selection on [23, 28).
  input.selection_start = 23;
  input.selection_end = 28;
  input.click_start = 29;
  input.click_end = 33;

  const auto extraction = internal::ExtractLineWithClick(input);
  const SelectionWithContext& line = std::get<0>(extraction);
  const int shift = std::get<1>(extraction);

  EXPECT_EQ(line.context, "Thiřd Lině");
  EXPECT_EQ(line.click_start, 6);
  EXPECT_EQ(line.click_end, 10);
  EXPECT_EQ(line.selection_start, 0);
  EXPECT_EQ(line.selection_end, 5);
  EXPECT_EQ(shift, 23);
}
// Same scenario as clicking on the second line, except that the first line
// break is a '|' instead of '\n'. The expectations are identical, i.e. the
// pipe is expected to delimit lines as well.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  SelectionWithContext input;
  input.context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  // Keeps the second line: click on [11, 17), selection on [18, 22).
  input.selection_start = 18;
  input.selection_end = 22;
  input.click_start = 11;
  input.click_end = 17;

  const auto extraction = internal::ExtractLineWithClick(input);
  const SelectionWithContext& line = std::get<0>(extraction);
  const int shift = std::get<1>(extraction);

  EXPECT_EQ(line.context, "Sěcond Lině");
  EXPECT_EQ(line.click_start, 0);
  EXPECT_EQ(line.click_end, 6);
  EXPECT_EQ(line.selection_start, 7);
  EXPECT_EQ(line.selection_end, 11);
  EXPECT_EQ(shift, 11);
}
// When the click itself spans a line break, no single line can be extracted:
// the context, all indices and the shift are expected to come back untouched.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  SelectionWithContext input;
  input.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  // Selects across lines, so KeepLine should not do any changes.
  input.selection_start = 0;
  input.selection_end = 22;
  input.click_start = 6;
  input.click_end = 17;

  const auto extraction = internal::ExtractLineWithClick(input);
  const SelectionWithContext& line = std::get<0>(extraction);
  const int shift = std::get<1>(extraction);

  EXPECT_EQ(line.context, "Fiřst Lině\nSěcond Lině\nThiřd Lině");
  EXPECT_EQ(line.click_start, 6);
  EXPECT_EQ(line.click_end, 17);
  EXPECT_EQ(line.selection_start, 0);
  EXPECT_EQ(line.selection_end, 22);
  EXPECT_EQ(shift, 0);
}
// With variable context dropout enabled at probability 0.5, two consecutive
// feature extractions over the same input are expected to differ.
TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
  using TokenFeatures = std::vector<std::vector<std::pair<int, float>>>;

  FeatureProcessorOptions options;
  options.set_num_buckets(10);
  options.set_context_size(7);
  options.set_max_selection_span(7);
  options.add_chargram_orders(1);
  options.set_tokenize_on_space(true);
  options.set_context_dropout_probability(0.5);
  options.set_use_variable_context_dropout(true);

  // Codepoint range [32, 33) -- the ASCII space -- acts as whitespace separator.
  TokenizationCodepointRange* space_range =
      options.add_tokenization_codepoint_config();
  space_range->set_start(32);
  space_range->set_end(33);
  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  FeatureProcessor feature_processor(options);
  feature_processor.SetRandom(new std::mt19937);

  SelectionWithContext input;
  input.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
  // Selection and click indices of the X in the middle:
  input.click_start = 20;
  input.click_end = 21;
  input.selection_start = 20;
  input.selection_end = 21;

  // Outputs other than the token features are shared by both runs; only the
  // token features are compared.
  std::vector<float> extra_features;
  std::vector<CodepointSpan> selection_label_spans;
  int selection_label;
  CodepointSpan selection_codepoint_label;
  int classification_label;

  // Test that two subsequent runs with random context dropout produce
  // different features.
  TokenFeatures first_run;
  EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
      input, &first_run, &extra_features, &selection_label_spans,
      &selection_label, &selection_codepoint_label, &classification_label));

  TokenFeatures second_run;
  EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
      input, &second_run, &extra_features, &selection_label_spans,
      &selection_label, &selection_codepoint_label, &classification_label));

  EXPECT_NE(first_run, second_run);
}
// Checks that GetFeaturesAndLabels yields the same number of per-token feature
// vectors (19) for a long context and for a context consisting of a single
// token, i.e. that short inputs get padded.
// NOTE(review): 19 equals 2 * context_size + 1 for context_size = 9 --
// presumably the clicked token plus context_size tokens on either side;
// confirm against FeatureProcessor.
TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
  FeatureProcessorOptions options;
  options.set_num_buckets(10);
  options.set_context_size(9);
  options.set_max_selection_span(7);
  options.add_chargram_orders(1);
  options.set_tokenize_on_space(true);

  // Codepoint range [32, 33) -- the ASCII space -- acts as whitespace separator.
  TokenizationCodepointRange* config =
      options.add_tokenization_codepoint_config();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  FeatureProcessor feature_processor(options);

  SelectionWithContext selection_with_context;
  selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
  // Selection and click indices of the X in the middle:
  selection_with_context.selection_start = 20;
  selection_with_context.selection_end = 21;
  selection_with_context.click_start = 20;
  selection_with_context.click_end = 21;

  std::vector<std::vector<std::pair<int, float>>> features;
  std::vector<float> extra_features;
  std::vector<CodepointSpan> selection_label_spans;
  // Scalar out-parameters are initialised so they never hold an indeterminate
  // value, whatever the callee does with them.
  int selection_label = 0;
  CodepointSpan selection_codepoint_label;
  int classification_label = 0;

  // ASSERT rather than EXPECT: the size check is meaningless if extraction
  // itself failed.
  ASSERT_TRUE(feature_processor.GetFeaturesAndLabels(
      selection_with_context, &features, &extra_features,
      &selection_label_spans, &selection_label, &selection_codepoint_label,
      &classification_label));
  // Unsigned literal: features.size() is a size_t, and comparing it with a
  // signed int triggers -Wsign-compare.
  EXPECT_EQ(19u, features.size());

  // Should pad the string.
  selection_with_context.context = "X";
  selection_with_context.selection_start = 0;
  selection_with_context.selection_end = 1;
  selection_with_context.click_start = 0;
  selection_with_context.click_end = 1;

  // Discard the 19 entries from the first run; otherwise the size check below
  // could pass on stale data if the second call left `features` untouched.
  features.clear();
  ASSERT_TRUE(feature_processor.GetFeaturesAndLabels(
      selection_with_context, &features, &extra_features,
      &selection_label_spans, &selection_label, &selection_codepoint_label,
      &classification_label));
  EXPECT_EQ(19u, features.size());
}
// Test-only subclass of FeatureProcessor that lets the tests below call
// FindTokensInSelection directly on an instance.
// NOTE(review): FindTokensInSelection is presumably non-public in
// FeatureProcessor (hence this wrapper) -- confirm against the header.
class TestingFeatureProcessor : public FeatureProcessor {
public:
// Inherit FeatureProcessor's constructors so the fixture is built the same way.
using FeatureProcessor::FeatureProcessor;
// Re-declare the member in the public section so test code can access it.
using FeatureProcessor::FindTokensInSelection;
};
// Selecting exactly one single-character token ("X" at [20, 21)) must return
// that token and nothing else.
TEST(FeatureProcessorTest, FindTokensInSelectionSingleCharacter) {
  FeatureProcessorOptions options;
  options.set_num_buckets(10);
  options.set_context_size(9);
  options.set_max_selection_span(7);
  options.add_chargram_orders(1);
  options.set_tokenize_on_space(true);

  // Codepoint range [32, 33) -- the ASCII space -- acts as whitespace separator.
  TokenizationCodepointRange* space_range =
      options.add_tokenization_codepoint_config();
  space_range->set_start(32);
  space_range->set_end(33);
  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingFeatureProcessor feature_processor(options);

  SelectionWithContext input;
  input.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
  // Selection and click indices of the X in the middle:
  input.selection_start = 20;
  input.selection_end = 21;

  const std::vector<Token> expected_tokens = {
      // clang-format off
      Token("X", 20, 21, false),
      // clang-format on
  };
  const auto context_tokens = feature_processor.Tokenize(input.context);

  EXPECT_THAT(feature_processor.FindTokensInSelection(context_tokens, input),
              ElementsAreArray(expected_tokens));
}
// Exercises FindTokensInSelection with selections whose end falls at various
// positions relative to token boundaries. The selection always starts at 10
// (the start of "350"); only its end moves.
TEST(FeatureProcessorTest, FindTokensInSelectionInsideTokenBoundary) {
  FeatureProcessorOptions options;
  options.set_num_buckets(10);
  options.set_context_size(9);
  options.set_max_selection_span(7);
  options.add_chargram_orders(1);
  options.set_tokenize_on_space(true);

  // Codepoint range [32, 33) -- the ASCII space -- acts as whitespace separator.
  TokenizationCodepointRange* space_range =
      options.add_tokenization_codepoint_config();
  space_range->set_start(32);
  space_range->set_end(33);
  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingFeatureProcessor feature_processor(options);

  SelectionWithContext input;
  input.context = "I live at 350 Third Street, today.";

  // Re-tokenizes the context and returns the tokens found for the selection
  // [10, selection_end).
  const auto tokens_selected_up_to = [&](int selection_end) {
    input.selection_start = 10;
    input.selection_end = selection_end;
    return feature_processor.FindTokensInSelection(
        feature_processor.Tokenize(input.context), input);
  };

  const std::vector<Token> street_address = {
      // clang-format off
      Token("350", 10, 13, false),
      Token("Third", 14, 19, false),
      Token("Street,", 20, 27, false),
      // clang-format on
  };

  // Selection: I live at {350 Third Str}eet, today.
  EXPECT_THAT(tokens_selected_up_to(23), ElementsAreArray(street_address));

  // Selection: I live at {350 Third Street,} today.
  EXPECT_THAT(tokens_selected_up_to(27), ElementsAreArray(street_address));

  // Selection: I live at {350 Third Street, }today.
  EXPECT_THAT(tokens_selected_up_to(28), ElementsAreArray(street_address));

  // Selection: I live at {350 Third S}treet, today.
  EXPECT_THAT(tokens_selected_up_to(21), ElementsAreArray(street_address));

  // Test that when crossing the boundary, we select less/more.

  // Selection: I live at {350 Third} Street, today.
  EXPECT_THAT(tokens_selected_up_to(19),
              ElementsAreArray({
                  // clang-format off
                  Token("350", 10, 13, false),
                  Token("Third", 14, 19, false),
                  // clang-format on
              }));

  // Selection: I live at {350 Third Street, t}oday.
  EXPECT_THAT(tokens_selected_up_to(29),
              ElementsAreArray({
                  // clang-format off
                  Token("350", 10, 13, false),
                  Token("Third", 14, 19, false),
                  Token("Street,", 20, 27, false),
                  Token("today.", 28, 34, false),
                  // clang-format on
              }));
}
} // namespace
} // namespace libtextclassifier