// Source blob: fbaf039b736e54bcfa78ed500eec888d99dbd588
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/annotator.h"
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier3 {
namespace {
using testing::ElementsAreArray;
using testing::IsEmpty;
using testing::Pair;
using testing::Values;
std::string FirstResult(const std::vector<ClassificationResult>& results) {
if (results.empty()) {
return "<INVALID RESULTS>";
}
return results[0].collection;
}
// Matches an annotated span whose `span` equals (start, end) and whose first
// classification result (see FirstResult) equals best_class.
MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") {
  if (!testing::Value(arg.span, Pair(start, end))) {
    return false;
  }
  return testing::Value(FirstResult(arg.classification), best_class);
}
// Reads the whole file `file_name` into a string and returns it.
//
// The stream is opened in binary mode: the tests use this helper to load
// serialized model buffers, and a text-mode stream may translate line endings
// or stop at an EOF marker on some platforms. If the file cannot be opened the
// result is an empty string.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream),
                     std::istreambuf_iterator<char>());
}
// Returns the test data directory (the TC3_TEST_DATA_DIR macro). The tests
// append a file name directly to this value, so it is presumably expected to
// end with a path separator -- verify against the build configuration.
std::string GetModelPath() {
  return std::string(TC3_TEST_DATA_DIR);
}
// Fixture shared by the annotator tests in this file. It is parameterized by
// a C string that the TEST_P cases append to GetModelPath() to locate the
// model file to load.
class AnnotatorTest : public ::testing::TestWithParam<const char*> {
protected:
// Initializes both members through the INIT_*_FOR_TESTING macros, which are
// defined outside this file.
AnnotatorTest()
: INIT_UNILIB_FOR_TESTING(unilib_),
INIT_CALENDARLIB_FOR_TESTING(calendarlib_) {}
// Passed by pointer to the Annotator factory functions in the tests.
UniLib unilib_;
CalendarLib calendarlib_;
};
// Loading the "wrong_embeddings.fb" model file must not produce an annotator.
TEST_F(AnnotatorTest, EmbeddingExecutorLoadingFails) {
  const std::string model_path = GetModelPath() + "wrong_embeddings.fb";
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  EXPECT_FALSE(annotator);
}
// Runs every TEST_P case of AnnotatorTest once per model file name below; the
// tests append the parameter to GetModelPath() to build the model path.
INSTANTIATE_TEST_CASE_P(ClickContext, AnnotatorTest,
Values("test_model_cc.fb"));
INSTANTIATE_TEST_CASE_P(BoundsSensitive, AnnotatorTest,
Values("test_model.fb"));
// Classifies spans with the unmodified model: plain text, multi-segment text,
// single words, junk input and invalid UTF-8.
TEST_P(AnnotatorTest, ClassifyText) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  constexpr char kSpeechText[] =
      "this afternoon Barack Obama gave a speech at";
  constexpr char kPhoneText[] = "Call me at (800) 123-456 today";
  constexpr char kSegmentedText[] =
      "this afternoon Barack Obama gave a speech at|Visit "
      "www.google.com every today!|Call me at (800) 123-456 today.";
  constexpr char kNoResults[] = "<INVALID RESULTS>";

  EXPECT_EQ("other",
            FirstResult(annotator->ClassifyText(kSpeechText, {15, 27})));
  EXPECT_EQ("phone",
            FirstResult(annotator->ClassifyText(kPhoneText, {11, 24})));

  // More lines.
  EXPECT_EQ("other",
            FirstResult(annotator->ClassifyText(kSegmentedText, {15, 27})));
  EXPECT_EQ("phone",
            FirstResult(annotator->ClassifyText(kSegmentedText, {90, 103})));

  // Single word.
  EXPECT_EQ("other", FirstResult(annotator->ClassifyText("obama", {0, 5})));
  EXPECT_EQ("other", FirstResult(annotator->ClassifyText("asdf", {0, 4})));
  EXPECT_EQ(kNoResults, FirstResult(annotator->ClassifyText("asdf", {0, 0})));

  // Junk.
  EXPECT_EQ(kNoResults, FirstResult(annotator->ClassifyText("", {0, 0})));
  EXPECT_EQ(kNoResults, FirstResult(annotator->ClassifyText(
                            "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5})));

  // Test invalid utf8 input.
  EXPECT_EQ(kNoResults, FirstResult(annotator->ClassifyText(
                            "\xf0\x9f\x98\x8b\x8b", {0, 0})));
}
// A model without a classification model must fail to load even when only
// selection is enabled.
TEST_P(AnnotatorTest, ClassifyTextDisabledFail) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Enable selection only and drop the classification model.
  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->enabled_modes = ModeFlag_SELECTION;
  model->classification_model.clear();

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);

  // The classification model is still needed for selection scores.
  ASSERT_FALSE(annotator);
}
// With only annotation and selection enabled in the triggering options,
// ClassifyText returns no results.
TEST_P(AnnotatorTest, ClassifyTextDisabled) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->enabled_modes = ModeFlag_ANNOTATION_AND_SELECTION;

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  EXPECT_THAT(
      annotator->ClassifyText("Call me at (800) 123-456 today", {11, 24}),
      IsEmpty());
}
// Filtering the "phone" collection for classification turns a phone result
// into "other" while address classification keeps working.
TEST_P(AnnotatorTest, ClassifyTextFilteredCollections) {
  constexpr char kPhoneText[] = "Call me at (800) 123-456 today";

  // Baseline: the unmodified model classifies the span as a phone number.
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      serialized_model.c_str(), serialized_model.size(), &unilib_,
      &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ("phone",
            FirstResult(annotator->ClassifyText(kPhoneText, {11, 24})));

  // Disable phone classification
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  model->output_options.reset(new OutputOptionsT);
  model->output_options->filtered_collections_classification.push_back(
      "phone");

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  annotator = Annotator::FromUnownedBuffer(packed_model, fbb.GetSize(),
                                           &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ("other",
            FirstResult(annotator->ClassifyText(kPhoneText, {11, 24})));

  // Check that the address classification still passes.
  EXPECT_EQ("address", FirstResult(annotator->ClassifyText(
                           "350 Third Street, Cambridge", {0, 27})));
}
// Builds a regex pattern entry for the given collection.
//
// The three enabled_for_* flags select the modes in which the pattern is
// active. `score` is stored both as the target classification score and as
// the priority score.
std::unique_ptr<RegexModel_::PatternT> MakePattern(
    const std::string& collection_name, const std::string& pattern,
    const bool enabled_for_classification, const bool enabled_for_selection,
    const bool enabled_for_annotation, const float score) {
  // We cannot directly operate with |= on the flag, so collect the bits in an
  // int and cast once at the end.
  int mode_bits = ModeFlag_NONE;
  if (enabled_for_annotation) {
    mode_bits |= ModeFlag_ANNOTATION;
  }
  if (enabled_for_classification) {
    mode_bits |= ModeFlag_CLASSIFICATION;
  }
  if (enabled_for_selection) {
    mode_bits |= ModeFlag_SELECTION;
  }

  std::unique_ptr<RegexModel_::PatternT> regex_pattern(
      new RegexModel_::PatternT);
  regex_pattern->collection_name = collection_name;
  regex_pattern->pattern = pattern;
  regex_pattern->enabled_modes = static_cast<ModeFlag>(mode_bits);
  regex_pattern->target_classification_score = score;
  regex_pattern->priority_score = score;
  return regex_pattern;
}
#ifdef TC3_UNILIB_ICU
// Adds classification-only regex patterns (person, flight and a
// Luhn-verified payment card) and checks the resulting classifications.
TEST_P(AnnotatorTest, ClassifyTextRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  model->regex_model->patterns.push_back(MakePattern(
      "person", "Barack Obama", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0));
  model->regex_model->patterns.push_back(MakePattern(
      "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "\\d{4}(?: \\d{4}){3}",
                  /*enabled_for_classification=*/true,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  model->regex_model->patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  EXPECT_EQ("flight",
            FirstResult(annotator->ClassifyText(
                "Your flight LX373 is delayed by 3 hours.", {12, 17})));
  EXPECT_EQ("person",
            FirstResult(annotator->ClassifyText(
                "this afternoon Barack Obama gave a speech at", {15, 27})));
  EXPECT_EQ("email",
            FirstResult(annotator->ClassifyText("you@android.com", {0, 15})));
  EXPECT_EQ("email", FirstResult(annotator->ClassifyText(
                         "Contact me at you@android.com", {14, 29})));
  EXPECT_EQ("url", FirstResult(annotator->ClassifyText(
                       "Visit www.google.com every today!", {6, 20})));
  EXPECT_EQ("flight", FirstResult(annotator->ClassifyText("LX 37", {0, 5})));
  EXPECT_EQ("flight", FirstResult(annotator->ClassifyText("flight LX 37 abcd",
                                                          {7, 12})));
  EXPECT_EQ("payment_card", FirstResult(annotator->ClassifyText(
                                "cc: 4012 8888 8888 1881", {4, 23})));
  EXPECT_EQ("payment_card", FirstResult(annotator->ClassifyText(
                                "2221 0067 4735 6281", {0, 19})));

  // Luhn check fails.
  EXPECT_EQ("other", FirstResult(annotator->ClassifyText("2221 0067 4735 6282",
                                                         {0, 19})));

  // More lines.
  constexpr char kSegmentedText[] =
      "this afternoon Barack Obama gave a speech at|Visit "
      "www.google.com every today!|Call me at (800) 123-456 today.";
  EXPECT_EQ("url",
            FirstResult(annotator->ClassifyText(kSegmentedText, {51, 65})));
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// Adds selection-only regex patterns and checks that a partial selection is
// expanded to the pattern's capturing group.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  model->regex_model->patterns.back()->priority_score = 1.1;
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/true,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  model->regex_model->patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Check regular expression selection.
  EXPECT_EQ(annotator->SuggestSelection(
                "Your flight MA 0123 is delayed by 3 hours.", {12, 14}),
            std::make_pair(12, 19));
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon Barack Obama gave a speech at", {15, 21}),
            std::make_pair(15, 27));
  EXPECT_EQ(annotator->SuggestSelection("cc: 4012 8888 8888 1881", {9, 14}),
            std::make_pair(4, 23));
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// When the flight regex has a priority score of 0.5, the expected selection
// is the longer span (26, 62) rather than the regex match (55, 62).
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsModelWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.back()->priority_score = 0.5;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
  // Use the fixture's UniLib and CalendarLib, like every other test in this
  // file, so all tests run against the same library configuration.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(26, 62));
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// When the flight regex has a priority score of 1.1, the expected selection
// is the regex match (55, 62).
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsRegexWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.back()->priority_score = 1.1;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
  // Use the fixture's UniLib and CalendarLib, like every other test in this
  // file, so all tests run against the same library configuration.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(55, 62));
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// Adds annotation-only regex patterns and checks that their matches show up
// next to the address and phone annotations.
TEST_P(AnnotatorTest, AnnotateRegex) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0));
  model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/true, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  model->regex_model->patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556\nand my card is 4012 8888 8888 1881.\n";
  EXPECT_THAT(annotator->Annotate(input_text),
              ElementsAreArray({IsAnnotatedSpan(6, 18, "person"),
                                IsAnnotatedSpan(28, 55, "address"),
                                IsAnnotatedSpan(79, 91, "phone"),
                                IsAnnotatedSpan(107, 126, "payment_card")}));
}
#endif  // TC3_UNILIB_ICU
// A phone span is still classified as "phone" with a short suffix, but as
// "other" once the selected span grows to {7, 28}.
TEST_P(AnnotatorTest, PhoneFiltering) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  constexpr char kPhoneWithSuffix[] = "phone: (123) 456 789,0001112";

  EXPECT_EQ("phone", FirstResult(annotator->ClassifyText(
                         "phone: (123) 456 789", {7, 20})));
  EXPECT_EQ("phone",
            FirstResult(annotator->ClassifyText(kPhoneWithSuffix, {7, 25})));
  EXPECT_EQ("other",
            FirstResult(annotator->ClassifyText(kPhoneWithSuffix, {7, 28})));
}
// Basic selection suggestions with the unmodified model, including bracket
// stripping and the fallback to the original span.
TEST_P(AnnotatorTest, SuggestSelection) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon Barack Obama gave a speech at", {15, 21}),
            std::make_pair(15, 21));

  // Try passing whole string.
  // If more than 1 token is specified, we should return back what entered.
  EXPECT_EQ(
      annotator->SuggestSelection("350 Third Street, Cambridge", {0, 27}),
      std::make_pair(0, 27));

  // Single letter.
  EXPECT_EQ(annotator->SuggestSelection("a", {0, 1}), std::make_pair(0, 1));

  // Single word.
  EXPECT_EQ(annotator->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4));

  EXPECT_EQ(
      annotator->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
      std::make_pair(11, 23));

  // Unpaired bracket stripping.
  EXPECT_EQ(
      annotator->SuggestSelection("call me at (857) 225 3556 today", {11, 16}),
      std::make_pair(11, 25));
  EXPECT_EQ(annotator->SuggestSelection("call me at (857 today", {11, 15}),
            std::make_pair(12, 15));
  EXPECT_EQ(annotator->SuggestSelection("call me at 3556) today", {11, 16}),
            std::make_pair(11, 15));
  EXPECT_EQ(annotator->SuggestSelection("call me at )857( today", {11, 16}),
            std::make_pair(12, 15));

  // If the resulting selection would be empty, the original span is returned.
  EXPECT_EQ(annotator->SuggestSelection("call me at )( today", {11, 13}),
            std::make_pair(11, 13));
  EXPECT_EQ(annotator->SuggestSelection("call me at ( today", {11, 12}),
            std::make_pair(11, 12));
  EXPECT_EQ(annotator->SuggestSelection("call me at ) today", {11, 12}),
            std::make_pair(11, 12));
}
// A model without a selection model must fail to load when annotation is the
// enabled mode.
TEST_P(AnnotatorTest, SuggestSelectionDisabledFail) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Disable the selection model.
  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->enabled_modes = ModeFlag_ANNOTATION;
  model->selection_model.clear();

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);

  // Selection model needs to be present for annotation.
  ASSERT_FALSE(annotator);
}
// With only classification enabled, selection returns the input span,
// classification still works and annotation yields nothing.
TEST_P(AnnotatorTest, SuggestSelectionDisabled) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Disable the selection model.
  model->selection_model.clear();
  model->enabled_modes = ModeFlag_CLASSIFICATION;
  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION;

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  constexpr char kPhoneText[] = "call me at (800) 123-456 today";

  EXPECT_EQ(
      annotator->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
      std::make_pair(11, 14));
  EXPECT_EQ("phone",
            FirstResult(annotator->ClassifyText(kPhoneText, {11, 24})));
  EXPECT_THAT(annotator->Annotate(kPhoneText), IsEmpty());
}
// Filtering the "phone" collection for selection leaves a phone selection
// unexpanded while address selection keeps working.
TEST_P(AnnotatorTest, SuggestSelectionFilteredCollections) {
  constexpr char kPhoneText[] = "call me at 857 225 3556 today";

  // Baseline: the unmodified model expands the selection to the whole number.
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      serialized_model.c_str(), serialized_model.size(), &unilib_,
      &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(annotator->SuggestSelection(kPhoneText, {11, 14}),
            std::make_pair(11, 23));

  // Disable phone selection
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  model->output_options.reset(new OutputOptionsT);
  model->output_options->filtered_collections_selection.push_back("phone");
  // We need to force this for filtering.
  model->selection_options->always_classify_suggested_selection = true;

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  annotator = Annotator::FromUnownedBuffer(packed_model, fbb.GetSize(),
                                           &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(annotator->SuggestSelection(kPhoneText, {11, 14}),
            std::make_pair(11, 14));

  // Address selection should still work.
  EXPECT_EQ(annotator->SuggestSelection("350 Third Street, Cambridge", {4, 9}),
            std::make_pair(0, 27));
}
// Selecting any single token of the address yields the same full-address
// selection.
TEST_P(AnnotatorTest, SuggestSelectionsAreSymmetric) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  constexpr char kAddress[] = "350 Third Street, Cambridge";

  EXPECT_EQ(annotator->SuggestSelection(kAddress, {0, 3}),
            std::make_pair(0, 27));
  EXPECT_EQ(annotator->SuggestSelection(kAddress, {4, 9}),
            std::make_pair(0, 27));
  EXPECT_EQ(annotator->SuggestSelection(kAddress, {10, 16}),
            std::make_pair(0, 27));
  EXPECT_EQ(annotator->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge",
                                        {16, 22}),
            std::make_pair(6, 33));
}
// Suggested selections stay on the line of the original selection.
TEST_P(AnnotatorTest, SuggestSelectionWithNewLine) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  EXPECT_EQ(annotator->SuggestSelection("abc\n857 225 3556", {4, 7}),
            std::make_pair(4, 16));
  EXPECT_EQ(annotator->SuggestSelection("857 225 3556\nabc", {0, 3}),
            std::make_pair(0, 12));

  // Same call shape, but with explicitly passed default options.
  SelectionOptions default_options;
  EXPECT_EQ(annotator->SuggestSelection("857 225\n3556\nabc", {0, 3},
                                        default_options),
            std::make_pair(0, 7));
}
// Punctuation next to the selected word is not pulled into the suggestion.
TEST_P(AnnotatorTest, SuggestSelectionWithPunctuation) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // From the right.
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon BarackObama, gave a speech at", {15, 26}),
            std::make_pair(15, 26));

  // From the right multiple.
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}),
            std::make_pair(15, 26));

  // From the left multiple.
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}),
            std::make_pair(21, 32));

  // From both sides.
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon !BarackObama,- gave a speech at", {16, 27}),
            std::make_pair(16, 27));
}
// Invalid spans and invalid UTF-8 are returned unchanged instead of crashing.
TEST_P(AnnotatorTest, SuggestSelectionNoCrashWithJunk) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  constexpr char kWords[] = "Word 1 2 3 hello!";

  // Try passing in bunch of invalid selections.
  EXPECT_EQ(annotator->SuggestSelection("", {0, 27}), std::make_pair(0, 27));
  EXPECT_EQ(annotator->SuggestSelection("", {-10, 27}),
            std::make_pair(-10, 27));
  EXPECT_EQ(annotator->SuggestSelection(kWords, {0, 27}),
            std::make_pair(0, 27));
  EXPECT_EQ(annotator->SuggestSelection(kWords, {-30, 300}),
            std::make_pair(-30, 300));
  EXPECT_EQ(annotator->SuggestSelection(kWords, {-10, -1}),
            std::make_pair(-10, -1));
  EXPECT_EQ(annotator->SuggestSelection(kWords, {100, 17}),
            std::make_pair(100, 17));

  // Try passing invalid utf8.
  EXPECT_EQ(annotator->SuggestSelection("\xf0\x9f\x98\x8b\x8b", {-1, -1}),
            std::make_pair(-1, -1));
}
// Selecting whitespace: a whitespace selection inside or directly after an
// entity is expected to expand to that entity, otherwise the original span is
// returned.
//
// Several fixture strings depend on runs of consecutive spaces; the exact
// number of spaces is stated next to each of them because the expected
// indices are only valid for that count.
TEST_P(AnnotatorTest, SuggestSelectionSelectSpace) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(
      classifier->SuggestSelection("call me at 857 225 3556 today", {14, 15}),
      std::make_pair(11, 23));
  EXPECT_EQ(
      classifier->SuggestSelection("call me at 857 225 3556 today", {10, 11}),
      std::make_pair(10, 11));
  EXPECT_EQ(
      classifier->SuggestSelection("call me at 857 225 3556 today", {23, 24}),
      std::make_pair(23, 24));
  EXPECT_EQ(
      classifier->SuggestSelection("call me at 857 225 3556, today", {23, 24}),
      std::make_pair(23, 24));
  // THREE spaces between "857" and "225": the selection {14, 17} is exactly
  // that whitespace run and the phone number occupies [11, 25). With a single
  // space the number would end at 23 and the expectation could not hold.
  EXPECT_EQ(classifier->SuggestSelection("call me at 857   225 3556, today",
                                         {14, 17}),
            std::make_pair(11, 25));
  EXPECT_EQ(
      classifier->SuggestSelection("call me at 857-225 3556, today", {14, 17}),
      std::make_pair(11, 23));
  EXPECT_EQ(
      classifier->SuggestSelection(
          "let's meet at 350 Third Street Cambridge and go there", {30, 31}),
      std::make_pair(14, 40));
  EXPECT_EQ(classifier->SuggestSelection("call me today", {4, 5}),
            std::make_pair(4, 5));
  EXPECT_EQ(classifier->SuggestSelection("call me today", {7, 8}),
            std::make_pair(7, 8));

  // With a punctuation around the selected whitespace.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "let's meet at 350 Third Street, Cambridge and go there", {31, 32}),
      std::make_pair(14, 41));

  // When all's whitespace, should return the original indices.
  // The text is SIX spaces long so that every span below, up to {5, 6}, lies
  // inside the text and really selects whitespace.
  EXPECT_EQ(classifier->SuggestSelection("      ", {0, 1}),
            std::make_pair(0, 1));
  EXPECT_EQ(classifier->SuggestSelection("      ", {0, 3}),
            std::make_pair(0, 3));
  EXPECT_EQ(classifier->SuggestSelection("      ", {2, 3}),
            std::make_pair(2, 3));
  EXPECT_EQ(classifier->SuggestSelection("      ", {5, 6}),
            std::make_pair(5, 6));
}
// Exercises internal::SnapLeftIfWhitespaceSelection on whitespace selections
// with and without non-whitespace text to their left.
//
// Every span passed below must lie inside the text, so the whitespace-only
// and leading-whitespace fixtures use runs of FIVE spaces (the largest span
// used is {4, 5}).
TEST_F(AnnotatorTest, SnapLeftIfWhitespaceSelection) {
  UnicodeText text;

  text = UTF8ToUnicodeText("abcd efgh", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
            std::make_pair(3, 4));
  text = UTF8ToUnicodeText("abcd ", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
            std::make_pair(3, 4));

  // Nothing on the left.
  // Five leading spaces: {4, 5} selects the last space before "efgh" and has
  // only whitespace to its left.
  text = UTF8ToUnicodeText("     efgh", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
            std::make_pair(4, 5));
  text = UTF8ToUnicodeText("     efgh", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
            std::make_pair(0, 1));

  // Whitespace only (five spaces).
  text = UTF8ToUnicodeText("     ", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({2, 3}, text, unilib_),
            std::make_pair(2, 3));
  text = UTF8ToUnicodeText("     ", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
            std::make_pair(4, 5));
  text = UTF8ToUnicodeText("     ", /*do_copy=*/false);
  EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
            std::make_pair(0, 1));
}
// Annotates text with the unmodified model, including a number split across
// lines and invalid UTF-8 input.
TEST_P(AnnotatorTest, Annotate) {
  const std::string model_path = GetModelPath() + GetParam();
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(model_path, &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  EXPECT_THAT(annotator->Annotate(input_text),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  AnnotationOptions default_options;
  EXPECT_THAT(annotator->Annotate("853 225 3556", default_options),
              ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
  EXPECT_TRUE(annotator->Annotate("853 225\n3556", default_options).empty());

  // Try passing invalid utf8.
  EXPECT_TRUE(
      annotator
          ->Annotate("853 225 3556\n\xf0\x9f\x98\x8b\x8b", default_options)
          .empty());
}
// Annotation results are the same with a selection batch size of 4.
TEST_P(AnnotatorTest, AnnotateSmallBatches) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Set the batch size.
  model->selection_options->batch_size = 4;

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  EXPECT_THAT(annotator->Annotate(input_text),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  AnnotationOptions default_options;
  EXPECT_THAT(annotator->Annotate("853 225 3556", default_options),
              ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
  EXPECT_TRUE(annotator->Annotate("853 225\n3556", default_options).empty());
}
#ifdef TC3_UNILIB_ICU
// A minimum annotation confidence of 2 discards every annotation.
TEST_P(AnnotatorTest, AnnotateFilteringDiscardAll) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test threshold.
  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->min_annotate_confidence =
      2.f;  // Discards all results.

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  EXPECT_THAT(annotator->Annotate(input_text), IsEmpty());
}
#endif  // TC3_UNILIB_ICU
// A minimum annotation confidence of 0 with all modes enabled keeps both
// annotations.
TEST_P(AnnotatorTest, AnnotateFilteringKeepAll) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test thresholds.
  model->triggering_options.reset(new ModelTriggeringOptionsT);
  model->triggering_options->enabled_modes = ModeFlag_ALL;
  model->triggering_options->min_annotate_confidence =
      0.f;  // Keeps all results.

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  const auto annotations = annotator->Annotate(input_text);
  EXPECT_EQ(annotations.size(), 2);
}
// With the model enabled only for classification and selection, Annotate
// returns nothing.
TEST_P(AnnotatorTest, AnnotateDisabled) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Disable the model for annotation.
  model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION;

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  EXPECT_THAT(annotator->Annotate(input_text), IsEmpty());
}
// Filtering the "phone" collection for annotation removes the phone
// annotation and keeps the address one.
TEST_P(AnnotatorTest, AnnotateFilteredCollections) {
  const std::string input_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // Baseline: the unmodified model annotates both the address and the phone.
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      serialized_model.c_str(), serialized_model.size(), &unilib_,
      &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_THAT(annotator->Annotate(input_text),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  // Disable phone annotation
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  model->output_options.reset(new OutputOptionsT);
  model->output_options->filtered_collections_annotation.push_back("phone");

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());
  annotator = Annotator::FromUnownedBuffer(packed_model, fbb.GetSize(),
                                           &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_THAT(annotator->Annotate(input_text),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
              }));
}
#ifdef TC3_UNILIB_ICU
// Checks that a filtered collection still takes part in conflict resolution:
// a high-scoring "suppress" regex annotation displaces the phone annotation
// and is then itself removed from the output.
TEST_P(AnnotatorTest, AnnotateFilteredCollectionsSuppress) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  const std::string input =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\n"
      "and my phone number is 853 225 3556";

  // Baseline: the unmodified model reports both the address and the phone.
  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      serialized_model.c_str(), serialized_model.size(), &unilib_,
      &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_THAT(annotator->Annotate(input),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  model->output_options.reset(new OutputOptionsT);

  // Register an annotation-only regex pattern for a "suppress" collection
  // that is meant to beat the phone classification, and list that collection
  // as filtered so it is dropped from the final result.
  model->output_options->filtered_collections_annotation.emplace_back(
      "suppress");
  model->regex_model->patterns.push_back(MakePattern(
      "suppress", "(\\d{3} ?\\d{4})",
      /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 2.0));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  annotator = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(fbb.GetBufferPointer()), fbb.GetSize(),
      &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Neither the phone nor the "suppress" annotation is expected any more.
  EXPECT_THAT(annotator->Annotate(input),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
              }));
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_CALENDAR_ICU
// Checks that ClassifyText recognizes dates, resolves them against the
// reference timezone supplied in the options, and reports the expected
// granularity.
TEST_P(AnnotatorTest, ClassifyTextDate) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Fatal on purpose: every statement below dereferences `classifier`, so a
  // failed model load must stop the test instead of crashing it.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // 2017-01-01 00:00:00 in Zurich (UTC+1).
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText("january 1, 2017", {0, 15}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // 2017-03-01 00:00:00 in Los Angeles (UTC-8 on that date).
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("march 1, 2017", {0, 13}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Date with a time of day: 2018-01-01 10:30:20 in Los Angeles.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_SECOND);

  // Date on another line.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText(
      "hello world this is the first line\n"
      "january 1, 2017",
      {35, 50}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // TC3_CALENDAR_ICU
#ifdef TC3_CALENDAR_ICU
// Checks that the locale in the classification options decides how the
// ambiguous date "03.05.1970" is read: month first for "en-US", day first
// for "de".
TEST_P(AnnotatorTest, ClassifyTextDatePriorities) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Fatal on purpose: every statement below dereferences `classifier`, so a
  // failed model load must stop the test instead of crashing it.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // en-US: interpreted as March 5, 1970 (midnight in Zurich, UTC+1).
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-US";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 5439600000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // de: interpreted as May 3, 1970 (midnight in Zurich, UTC+1).
  options.reference_timezone = "Europe/Zurich";
  options.locales = "de";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 10537200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // TC3_CALENDAR_ICU
#ifdef TC3_CALENDAR_ICU
// Checks that datetime patterns restricted to annotation and classification
// still produce "date" results in those two modes, while SuggestSelection
// returns the input span unchanged.
TEST_P(AnnotatorTest, SuggestTextDateDisabled) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Disable the patterns for selection. Range-for avoids comparing a signed
  // index against the unsigned container size.
  for (auto& pattern : unpacked_model->datetime_model->patterns) {
    pattern->enabled_modes = ModeFlag_ANNOTATION_AND_CLASSIFICATION;
  }

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Classification still recognizes the date.
  EXPECT_EQ("date",
            FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15})));
  // Selection is not expanded beyond the span that was passed in.
  EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}),
            std::make_pair(0, 7));
  // Annotation still reports the whole date.
  EXPECT_THAT(classifier->Annotate("january 1, 2017"),
              ElementsAreArray({IsAnnotatedSpan(0, 15, "date")}));
}
#endif  // TC3_CALENDAR_ICU
// Test-only Annotator subclass that is constructed from an in-memory model
// buffer and makes ResolveConflicts callable from the tests below.
class TestingAnnotator : public Annotator {
public:
// Wraps the bytes of `model` with ViewModel() and forwards the result,
// together with the caller-provided `unilib` and `calendarlib`, to the
// Annotator constructor.
// NOTE(review): ViewModel presumably does not copy the bytes, so `model`
// should outlive this object -- confirm against annotator.h.
TestingAnnotator(const std::string& model, const UniLib* unilib,
const CalendarLib* calendarlib)
: Annotator(ViewModel(model.data(), model.size()), unilib, calendarlib) {}
// Re-declares the inherited ResolveConflicts in the public section so tests
// can call it directly (presumably it is not public in Annotator -- confirm).
using Annotator::ResolveConflicts;
};
// Builds an AnnotatedSpan that covers `span` and carries exactly one
// classification result, made of `collection` and `score`.
AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span,
                                const std::string& collection,
                                const float score) {
  AnnotatedSpan annotated;
  annotated.classification.push_back({collection, score});
  annotated.span = span;
  return annotated;
}
// A single candidate has nothing to conflict with, so index 0 is expected in
// the output.
TEST_F(AnnotatorTest, ResolveConflictsTrivial) {
  TestingAnnotator annotator("", &unilib_, &calendarlib_);
  std::vector<AnnotatedSpan> spans = {
      MakeAnnotatedSpan({0, 1}, "phone", 1.0)};

  std::vector<int> kept;
  annotator.ResolveConflicts(spans, /*context=*/"", /*cached_tokens=*/{},
                             /*interpreter_manager=*/nullptr, &kept);

  EXPECT_THAT(kept, ElementsAreArray({0}));
}
// Back-to-back spans that merely touch at their boundaries are all expected
// to be kept.
TEST_F(AnnotatorTest, ResolveConflictsSequence) {
  TestingAnnotator annotator("", &unilib_, &calendarlib_);
  std::vector<AnnotatedSpan> spans = {
      MakeAnnotatedSpan({0, 1}, "phone", 1.0),
      MakeAnnotatedSpan({1, 2}, "phone", 1.0),
      MakeAnnotatedSpan({2, 3}, "phone", 1.0),
      MakeAnnotatedSpan({3, 4}, "phone", 1.0),
      MakeAnnotatedSpan({4, 5}, "phone", 1.0),
  };

  std::vector<int> kept;
  annotator.ResolveConflicts(spans, /*context=*/"", /*cached_tokens=*/{},
                             /*interpreter_manager=*/nullptr, &kept);

  EXPECT_THAT(kept, ElementsAreArray({0, 1, 2, 3, 4}));
}
// The middle span overlaps both neighbours and has the lowest score, so only
// the two outer spans are expected to be kept.
TEST_F(AnnotatorTest, ResolveConflictsThreeSpans) {
  TestingAnnotator annotator("", &unilib_, &calendarlib_);
  std::vector<AnnotatedSpan> spans = {
      MakeAnnotatedSpan({0, 3}, "phone", 1.0),
      MakeAnnotatedSpan({1, 5}, "phone", 0.5),  // Expected loser.
      MakeAnnotatedSpan({3, 7}, "phone", 1.0),
  };

  std::vector<int> kept;
  annotator.ResolveConflicts(spans, /*context=*/"", /*cached_tokens=*/{},
                             /*interpreter_manager=*/nullptr, &kept);

  EXPECT_THAT(kept, ElementsAreArray({0, 2}));
}
// The middle span overlaps both neighbours and has the highest score, so it
// is expected to be the only span kept.
TEST_F(AnnotatorTest, ResolveConflictsThreeSpansReversed) {
  TestingAnnotator annotator("", &unilib_, &calendarlib_);
  std::vector<AnnotatedSpan> spans = {
      MakeAnnotatedSpan({0, 3}, "phone", 0.5),  // Expected loser.
      MakeAnnotatedSpan({1, 5}, "phone", 1.0),
      MakeAnnotatedSpan({3, 7}, "phone", 0.6),  // Expected loser.
  };

  std::vector<int> kept;
  annotator.ResolveConflicts(spans, /*context=*/"", /*cached_tokens=*/{},
                             /*interpreter_manager=*/nullptr, &kept);

  EXPECT_THAT(kept, ElementsAreArray({1}));
}
// Two separate groups of overlapping spans. The "other" span is expected to
// be dropped even though it has the highest score in its group; in the
// second group the lower-scoring span is expected to be dropped.
TEST_F(AnnotatorTest, ResolveConflictsFiveSpans) {
  TestingAnnotator annotator("", &unilib_, &calendarlib_);
  std::vector<AnnotatedSpan> spans = {
      MakeAnnotatedSpan({0, 3}, "phone", 0.5),
      MakeAnnotatedSpan({1, 5}, "other", 1.0),  // Expected loser.
      MakeAnnotatedSpan({3, 7}, "phone", 0.6),
      MakeAnnotatedSpan({8, 12}, "phone", 0.6),  // Expected loser.
      MakeAnnotatedSpan({11, 15}, "phone", 0.9),
  };

  std::vector<int> kept;
  annotator.ResolveConflicts(spans, /*context=*/"", /*cached_tokens=*/{},
                             /*interpreter_manager=*/nullptr, &kept);

  EXPECT_THAT(kept, ElementsAreArray({0, 2, 4}));
}
#ifdef TC3_UNILIB_ICU
// Checks that an entity padded with 50000 spaces on each side is still
// annotated, selected and classified with the expected collection and span.
TEST_P(AnnotatorTest, LongInput) {
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Number of spaces placed before and after each entity.
  const int padding = 50000;

  // Pairs of (expected collection, entity text).
  const std::vector<std::pair<std::string, std::string>> cases = {
      {"address", "350 Third Street, Cambridge"},
      {"phone", "123 456-7890"},
      {"url", "www.google.com"},
      {"email", "someone@gmail.com"},
      {"flight", "LX 38"},
      {"date", "September 1, 2018"}};

  for (const std::pair<std::string, std::string>& test_case : cases) {
    const std::string& collection = test_case.first;
    const std::string& entity = test_case.second;

    const std::string whitespace(padding, ' ');
    const std::string padded_input = whitespace + entity + whitespace;
    const int entity_end = padding + static_cast<int>(entity.size());

    EXPECT_THAT(
        annotator->Annotate(padded_input),
        ElementsAreArray({IsAnnotatedSpan(padding, entity_end, collection)}));
    // Selecting the first character must expand to the whole entity.
    EXPECT_EQ(annotator->SuggestSelection(padded_input, {padding, padding + 1}),
              std::make_pair(padding, entity_end));
    EXPECT_EQ(collection, FirstResult(annotator->ClassifyText(
                              padded_input, {padding, entity_end})));
  }
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// Coarse test whose only purpose is to make sure that execution on a long
// input finishes in a reasonable amount of time; the results themselves are
// not checked.
TEST_P(AnnotatorTest, LongInputNoResultCheck) {
  std::unique_ptr<Annotator> annotator =
      Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Number of spaces placed before and after each value.
  const int padding = 50000;
  const std::vector<std::string> values = {
      "http://www.aaaaaaaaaaaaaaaaaaaa.com "};

  for (const std::string& value : values) {
    const std::string whitespace(padding, ' ');
    const std::string padded_input = whitespace + value + whitespace;
    const int value_end = padding + static_cast<int>(value.size());

    annotator->Annotate(padded_input);
    annotator->SuggestSelection(padded_input, {padding, padding + 1});
    annotator->ClassifyText(padded_input, {padding, value_end});
  }
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// Checks the effect of classification_options.max_num_tokens: with -1 the
// address is classified as usual, with a limit of 3 the same span is
// classified as "other".
TEST_P(AnnotatorTest, MaxTokenLength) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  std::unique_ptr<Annotator> annotator;

  const std::string text = "I live at 350 Third Street, Cambridge.";

  // Without a restriction on the number of tokens the address is recognized.
  model->classification_options->max_num_tokens = -1;
  flatbuffers::FlatBufferBuilder unrestricted_fbb;
  FinishModelBuffer(unrestricted_fbb,
                    Model::Pack(unrestricted_fbb, model.get()));
  annotator = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(unrestricted_fbb.GetBufferPointer()),
      unrestricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(text, {10, 37})), "address");

  // Restrict the maximum number of tokens to 3 to suppress the
  // classification.
  model->classification_options->max_num_tokens = 3;
  flatbuffers::FlatBufferBuilder restricted_fbb;
  FinishModelBuffer(restricted_fbb, Model::Pack(restricted_fbb, model.get()));
  annotator = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(restricted_fbb.GetBufferPointer()),
      restricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(text, {10, 37})), "other");
}
#endif  // TC3_UNILIB_ICU
#ifdef TC3_UNILIB_ICU
// Checks the effect of classification_options.address_min_num_tokens: with 0
// the address is classified as usual, with a minimum of 5 the same span is
// classified as "other".
TEST_P(AnnotatorTest, MinAddressTokenLength) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());
  std::unique_ptr<Annotator> annotator;

  const std::string text = "I live at 350 Third Street, Cambridge.";

  // Without a minimum number of address tokens the address is recognized.
  model->classification_options->address_min_num_tokens = 0;
  flatbuffers::FlatBufferBuilder permissive_fbb;
  FinishModelBuffer(permissive_fbb, Model::Pack(permissive_fbb, model.get()));
  annotator = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(permissive_fbb.GetBufferPointer()),
      permissive_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(text, {10, 37})), "address");

  // Raise the minimum number of address tokens to suppress the address
  // classification.
  model->classification_options->address_min_num_tokens = 5;
  flatbuffers::FlatBufferBuilder strict_fbb;
  FinishModelBuffer(strict_fbb, Model::Pack(strict_fbb, model.get()));
  annotator = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(strict_fbb.GetBufferPointer()),
      strict_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(text, {10, 37})), "other");
}
#endif  // TC3_UNILIB_ICU
} // namespace
} // namespace libtextclassifier3