blob: fbaf039b736e54bcfa78ed500eec888d99dbd588 [file] [log] [blame]
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Tony Mak6c4cc672018-09-17 11:48:50 +010017#include "annotator/annotator.h"
Lukas Zilka21d8c982018-01-24 11:11:20 +010018
19#include <fstream>
20#include <iostream>
21#include <memory>
22#include <string>
23
Tony Mak6c4cc672018-09-17 11:48:50 +010024#include "annotator/model_generated.h"
25#include "annotator/types-test-util.h"
Lukas Zilka21d8c982018-01-24 11:11:20 +010026#include "gmock/gmock.h"
27#include "gtest/gtest.h"
28
Tony Mak6c4cc672018-09-17 11:48:50 +010029namespace libtextclassifier3 {
Lukas Zilka21d8c982018-01-24 11:11:20 +010030namespace {
31
32using testing::ElementsAreArray;
Lukas Zilkaba849e72018-03-08 14:48:21 +010033using testing::IsEmpty;
Lukas Zilka21d8c982018-01-24 11:11:20 +010034using testing::Pair;
Lukas Zilkab23e2122018-02-09 10:25:19 +010035using testing::Values;
Lukas Zilka21d8c982018-01-24 11:11:20 +010036
Lukas Zilkab23e2122018-02-09 10:25:19 +010037std::string FirstResult(const std::vector<ClassificationResult>& results) {
Lukas Zilka21d8c982018-01-24 11:11:20 +010038 if (results.empty()) {
39 return "<INVALID RESULTS>";
40 }
Lukas Zilkab23e2122018-02-09 10:25:19 +010041 return results[0].collection;
Lukas Zilka21d8c982018-01-24 11:11:20 +010042}
43
44MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") {
45 return testing::Value(arg.span, Pair(start, end)) &&
46 testing::Value(FirstResult(arg.classification), best_class);
47}
48
// Reads the entire contents of `file_name` into a string.
//
// The stream is opened in binary mode because the callers in this file load
// serialized FlatBuffer models; a text-mode stream may translate line endings
// or stop at an EOF marker byte on some platforms and corrupt the buffer.
//
// Returns an empty string if the file cannot be opened.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  if (!file_stream) {
    return std::string();
  }
  return std::string(std::istreambuf_iterator<char>(file_stream),
                     std::istreambuf_iterator<char>());
}
53
54std::string GetModelPath() {
Tony Maka0f598b2018-11-20 20:39:04 +000055 return TC3_TEST_DATA_DIR;
Lukas Zilka21d8c982018-01-24 11:11:20 +010056}
57
Tony Mak6c4cc672018-09-17 11:48:50 +010058class AnnotatorTest : public ::testing::TestWithParam<const char*> {
59 protected:
60 AnnotatorTest()
61 : INIT_UNILIB_FOR_TESTING(unilib_),
62 INIT_CALENDARLIB_FOR_TESTING(calendarlib_) {}
63 UniLib unilib_;
64 CalendarLib calendarlib_;
65};
66
67TEST_F(AnnotatorTest, EmbeddingExecutorLoadingFails) {
68 std::unique_ptr<Annotator> classifier = Annotator::FromPath(
69 GetModelPath() + "wrong_embeddings.fb", &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +010070 EXPECT_FALSE(classifier);
71}
72
Tony Mak6c4cc672018-09-17 11:48:50 +010073INSTANTIATE_TEST_CASE_P(ClickContext, AnnotatorTest,
Lukas Zilkab23e2122018-02-09 10:25:19 +010074 Values("test_model_cc.fb"));
Tony Mak6c4cc672018-09-17 11:48:50 +010075INSTANTIATE_TEST_CASE_P(BoundsSensitive, AnnotatorTest,
Lukas Zilkab23e2122018-02-09 10:25:19 +010076 Values("test_model.fb"));
77
Tony Mak6c4cc672018-09-17 11:48:50 +010078TEST_P(AnnotatorTest, ClassifyText) {
79 std::unique_ptr<Annotator> classifier =
80 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +010081 ASSERT_TRUE(classifier);
82
83 EXPECT_EQ("other",
84 FirstResult(classifier->ClassifyText(
85 "this afternoon Barack Obama gave a speech at", {15, 27})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010086 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
87 "Call me at (800) 123-456 today", {11, 24})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010088
89 // More lines.
90 EXPECT_EQ("other",
91 FirstResult(classifier->ClassifyText(
92 "this afternoon Barack Obama gave a speech at|Visit "
93 "www.google.com every today!|Call me at (800) 123-456 today.",
94 {15, 27})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010095 EXPECT_EQ("phone",
96 FirstResult(classifier->ClassifyText(
97 "this afternoon Barack Obama gave a speech at|Visit "
98 "www.google.com every today!|Call me at (800) 123-456 today.",
99 {90, 103})));
100
101 // Single word.
102 EXPECT_EQ("other", FirstResult(classifier->ClassifyText("obama", {0, 5})));
103 EXPECT_EQ("other", FirstResult(classifier->ClassifyText("asdf", {0, 4})));
104 EXPECT_EQ("<INVALID RESULTS>",
105 FirstResult(classifier->ClassifyText("asdf", {0, 0})));
106
107 // Junk.
108 EXPECT_EQ("<INVALID RESULTS>",
109 FirstResult(classifier->ClassifyText("", {0, 0})));
110 EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText(
111 "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5})));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200112 // Test invalid utf8 input.
113 EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText(
114 "\xf0\x9f\x98\x8b\x8b", {0, 0})));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100115}
116
Tony Mak6c4cc672018-09-17 11:48:50 +0100117TEST_P(AnnotatorTest, ClassifyTextDisabledFail) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100118 const std::string test_model = ReadFile(GetModelPath() + GetParam());
119 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
120
121 unpacked_model->classification_model.clear();
122 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
123 unpacked_model->triggering_options->enabled_modes = ModeFlag_SELECTION;
124
125 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000126 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100127
Tony Mak6c4cc672018-09-17 11:48:50 +0100128 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
129 reinterpret_cast<const char*>(builder.GetBufferPointer()),
130 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100131
132 // The classification model is still needed for selection scores.
133 ASSERT_FALSE(classifier);
134}
135
Tony Mak6c4cc672018-09-17 11:48:50 +0100136TEST_P(AnnotatorTest, ClassifyTextDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100137 const std::string test_model = ReadFile(GetModelPath() + GetParam());
138 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
139
140 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
141 unpacked_model->triggering_options->enabled_modes =
142 ModeFlag_ANNOTATION_AND_SELECTION;
143
144 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000145 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100146
Tony Mak6c4cc672018-09-17 11:48:50 +0100147 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
148 reinterpret_cast<const char*>(builder.GetBufferPointer()),
149 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100150 ASSERT_TRUE(classifier);
151
152 EXPECT_THAT(
153 classifier->ClassifyText("Call me at (800) 123-456 today", {11, 24}),
154 IsEmpty());
155}
156
Tony Mak6c4cc672018-09-17 11:48:50 +0100157TEST_P(AnnotatorTest, ClassifyTextFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200158 const std::string test_model = ReadFile(GetModelPath() + GetParam());
159
Tony Mak6c4cc672018-09-17 11:48:50 +0100160 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
161 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200162 ASSERT_TRUE(classifier);
163
164 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
165 "Call me at (800) 123-456 today", {11, 24})));
166
167 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
168 unpacked_model->output_options.reset(new OutputOptionsT);
169
170 // Disable phone classification
171 unpacked_model->output_options->filtered_collections_classification.push_back(
172 "phone");
173
174 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000175 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200176
Tony Mak6c4cc672018-09-17 11:48:50 +0100177 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200178 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100179 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200180 ASSERT_TRUE(classifier);
181
182 EXPECT_EQ("other", FirstResult(classifier->ClassifyText(
183 "Call me at (800) 123-456 today", {11, 24})));
184
185 // Check that the address classification still passes.
186 EXPECT_EQ("address", FirstResult(classifier->ClassifyText(
187 "350 Third Street, Cambridge", {0, 27})));
188}
189
Lukas Zilkab23e2122018-02-09 10:25:19 +0100190std::unique_ptr<RegexModel_::PatternT> MakePattern(
191 const std::string& collection_name, const std::string& pattern,
192 const bool enabled_for_classification, const bool enabled_for_selection,
193 const bool enabled_for_annotation, const float score) {
194 std::unique_ptr<RegexModel_::PatternT> result(new RegexModel_::PatternT);
195 result->collection_name = collection_name;
196 result->pattern = pattern;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100197 // We cannot directly operate with |= on the flag, so use an int here.
198 int enabled_modes = ModeFlag_NONE;
199 if (enabled_for_annotation) enabled_modes |= ModeFlag_ANNOTATION;
200 if (enabled_for_classification) enabled_modes |= ModeFlag_CLASSIFICATION;
201 if (enabled_for_selection) enabled_modes |= ModeFlag_SELECTION;
202 result->enabled_modes = static_cast<ModeFlag>(enabled_modes);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100203 result->target_classification_score = score;
204 result->priority_score = score;
205 return result;
206}
207
#ifdef TC3_UNILIB_ICU
// Adds classification-only regex patterns (person, flight, Luhn-verified
// payment card) to the model and checks the resulting classifications.
TEST_P(AnnotatorTest, ClassifyTextRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", "Barack Obama", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0));
  patterns.push_back(MakePattern(
      "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "\\d{4}(?: \\d{4}){3}",
                  /*enabled_for_classification=*/true,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());

  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      buffer, builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  EXPECT_EQ("flight",
            FirstResult(annotator->ClassifyText(
                "Your flight LX373 is delayed by 3 hours.", {12, 17})));
  EXPECT_EQ("person",
            FirstResult(annotator->ClassifyText(
                "this afternoon Barack Obama gave a speech at", {15, 27})));
  EXPECT_EQ("email",
            FirstResult(annotator->ClassifyText("you@android.com", {0, 15})));
  EXPECT_EQ("email",
            FirstResult(annotator->ClassifyText(
                "Contact me at you@android.com", {14, 29})));

  EXPECT_EQ("url",
            FirstResult(annotator->ClassifyText(
                "Visit www.google.com every today!", {6, 20})));

  EXPECT_EQ("flight", FirstResult(annotator->ClassifyText("LX 37", {0, 5})));
  EXPECT_EQ("flight",
            FirstResult(annotator->ClassifyText("flight LX 37 abcd",
                                                {7, 12})));
  EXPECT_EQ("payment_card",
            FirstResult(annotator->ClassifyText("cc: 4012 8888 8888 1881",
                                                {4, 23})));
  EXPECT_EQ("payment_card",
            FirstResult(annotator->ClassifyText("2221 0067 4735 6281",
                                                {0, 19})));
  // Luhn check fails.
  EXPECT_EQ("other",
            FirstResult(annotator->ClassifyText("2221 0067 4735 6282",
                                                {0, 19})));

  // More lines.
  const std::string multi_segment_text =
      "this afternoon Barack Obama gave a speech at|Visit "
      "www.google.com every today!|Call me at (800) 123-456 today.";
  EXPECT_EQ("url", FirstResult(annotator->ClassifyText(multi_segment_text,
                                                       {51, 65})));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100270
#ifdef TC3_UNILIB_ICU
// Adds selection-only regex patterns to the model and checks that
// SuggestSelection expands clicks to the regex capture groups.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  patterns.back()->priority_score = 1.1;
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/true,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());

  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      buffer, builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Check regular expression selection.
  EXPECT_EQ(annotator->SuggestSelection(
                "Your flight MA 0123 is delayed by 3 hours.", {12, 14}),
            std::make_pair(12, 19));
  EXPECT_EQ(annotator->SuggestSelection(
                "this afternoon Barack Obama gave a speech at", {15, 21}),
            std::make_pair(15, 27));
  EXPECT_EQ(annotator->SuggestSelection("cc: 4012 8888 8888 1881", {9, 14}),
            std::make_pair(4, 23));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100312
#ifdef TC3_UNILIB_ICU
// When a regex selection overlaps a model selection and the regex pattern has
// the lower priority score (0.5), the model's selection must be returned.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsModelWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // Lower the flight pattern's priority so the overlapping model span wins.
  unpacked_model->regex_model->patterns.back()->priority_score = 0.5;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  // Pass the fixture's UniLib/CalendarLib, consistent with the other tests in
  // this file, instead of relying on the factory's defaults.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(26, 62));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100343
#ifdef TC3_UNILIB_ICU
// When a regex selection overlaps a model selection and the regex pattern has
// the higher priority score (1.1), the regex's selection must be returned.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsRegexWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // Raise the flight pattern's priority so it beats the overlapping model span.
  unpacked_model->regex_model->patterns.back()->priority_score = 1.1;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  // Pass the fixture's UniLib/CalendarLib, consistent with the other tests in
  // this file, instead of relying on the factory's defaults.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(55, 62));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100374
#ifdef TC3_UNILIB_ICU
// Adds annotation-only regex patterns to the model and checks that Annotate
// reports the regex spans alongside the model's own spans.
TEST_P(AnnotatorTest, AnnotateRegex) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0));
  patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/true, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());

  std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      buffer, builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string annotated_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556\nand my card is 4012 8888 8888 1881.\n";
  EXPECT_THAT(
      annotator->Annotate(annotated_text),
      ElementsAreArray({IsAnnotatedSpan(6, 18, "person"),
                        IsAnnotatedSpan(28, 55, "address"),
                        IsAnnotatedSpan(79, 91, "phone"),
                        IsAnnotatedSpan(107, 126, "payment_card")}));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100413
Tony Mak6c4cc672018-09-17 11:48:50 +0100414TEST_P(AnnotatorTest, PhoneFiltering) {
415 std::unique_ptr<Annotator> classifier =
416 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100417 ASSERT_TRUE(classifier);
418
419 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
420 "phone: (123) 456 789", {7, 20})));
421 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
422 "phone: (123) 456 789,0001112", {7, 25})));
423 EXPECT_EQ("other", FirstResult(classifier->ClassifyText(
424 "phone: (123) 456 789,0001112", {7, 28})));
425}
426
Tony Mak6c4cc672018-09-17 11:48:50 +0100427TEST_P(AnnotatorTest, SuggestSelection) {
428 std::unique_ptr<Annotator> classifier =
429 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100430 ASSERT_TRUE(classifier);
431
432 EXPECT_EQ(classifier->SuggestSelection(
433 "this afternoon Barack Obama gave a speech at", {15, 21}),
434 std::make_pair(15, 21));
435
436 // Try passing whole string.
437 // If more than 1 token is specified, we should return back what entered.
438 EXPECT_EQ(
439 classifier->SuggestSelection("350 Third Street, Cambridge", {0, 27}),
440 std::make_pair(0, 27));
441
442 // Single letter.
443 EXPECT_EQ(classifier->SuggestSelection("a", {0, 1}), std::make_pair(0, 1));
444
445 // Single word.
446 EXPECT_EQ(classifier->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4));
447
448 EXPECT_EQ(
449 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
450 std::make_pair(11, 23));
451
452 // Unpaired bracket stripping.
453 EXPECT_EQ(
454 classifier->SuggestSelection("call me at (857) 225 3556 today", {11, 16}),
455 std::make_pair(11, 25));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100456 EXPECT_EQ(classifier->SuggestSelection("call me at (857 today", {11, 15}),
457 std::make_pair(12, 15));
458 EXPECT_EQ(classifier->SuggestSelection("call me at 3556) today", {11, 16}),
459 std::make_pair(11, 15));
460 EXPECT_EQ(classifier->SuggestSelection("call me at )857( today", {11, 16}),
461 std::make_pair(12, 15));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100462
463 // If the resulting selection would be empty, the original span is returned.
464 EXPECT_EQ(classifier->SuggestSelection("call me at )( today", {11, 13}),
465 std::make_pair(11, 13));
466 EXPECT_EQ(classifier->SuggestSelection("call me at ( today", {11, 12}),
467 std::make_pair(11, 12));
468 EXPECT_EQ(classifier->SuggestSelection("call me at ) today", {11, 12}),
469 std::make_pair(11, 12));
470}
471
Tony Mak6c4cc672018-09-17 11:48:50 +0100472TEST_P(AnnotatorTest, SuggestSelectionDisabledFail) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100473 const std::string test_model = ReadFile(GetModelPath() + GetParam());
474 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
475
476 // Disable the selection model.
477 unpacked_model->selection_model.clear();
478 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
479 unpacked_model->triggering_options->enabled_modes = ModeFlag_ANNOTATION;
480
481 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000482 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100483
Tony Mak6c4cc672018-09-17 11:48:50 +0100484 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
485 reinterpret_cast<const char*>(builder.GetBufferPointer()),
486 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100487 // Selection model needs to be present for annotation.
488 ASSERT_FALSE(classifier);
489}
490
Tony Mak6c4cc672018-09-17 11:48:50 +0100491TEST_P(AnnotatorTest, SuggestSelectionDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100492 const std::string test_model = ReadFile(GetModelPath() + GetParam());
493 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
494
495 // Disable the selection model.
496 unpacked_model->selection_model.clear();
497 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
498 unpacked_model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION;
499 unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION;
500
501 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000502 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100503
Tony Mak6c4cc672018-09-17 11:48:50 +0100504 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
505 reinterpret_cast<const char*>(builder.GetBufferPointer()),
506 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100507 ASSERT_TRUE(classifier);
508
509 EXPECT_EQ(
510 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
511 std::make_pair(11, 14));
512
513 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
514 "call me at (800) 123-456 today", {11, 24})));
515
516 EXPECT_THAT(classifier->Annotate("call me at (800) 123-456 today"),
517 IsEmpty());
518}
519
Tony Mak6c4cc672018-09-17 11:48:50 +0100520TEST_P(AnnotatorTest, SuggestSelectionFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200521 const std::string test_model = ReadFile(GetModelPath() + GetParam());
522
Tony Mak6c4cc672018-09-17 11:48:50 +0100523 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
524 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200525 ASSERT_TRUE(classifier);
526
527 EXPECT_EQ(
528 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
529 std::make_pair(11, 23));
530
531 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
532 unpacked_model->output_options.reset(new OutputOptionsT);
533
534 // Disable phone selection
535 unpacked_model->output_options->filtered_collections_selection.push_back(
536 "phone");
537 // We need to force this for filtering.
538 unpacked_model->selection_options->always_classify_suggested_selection = true;
539
540 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000541 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200542
Tony Mak6c4cc672018-09-17 11:48:50 +0100543 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200544 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100545 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200546 ASSERT_TRUE(classifier);
547
548 EXPECT_EQ(
549 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
550 std::make_pair(11, 14));
551
552 // Address selection should still work.
553 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}),
554 std::make_pair(0, 27));
555}
556
Tony Mak6c4cc672018-09-17 11:48:50 +0100557TEST_P(AnnotatorTest, SuggestSelectionsAreSymmetric) {
558 std::unique_ptr<Annotator> classifier =
559 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100560 ASSERT_TRUE(classifier);
561
562 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {0, 3}),
563 std::make_pair(0, 27));
564 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}),
565 std::make_pair(0, 27));
566 EXPECT_EQ(
567 classifier->SuggestSelection("350 Third Street, Cambridge", {10, 16}),
568 std::make_pair(0, 27));
569 EXPECT_EQ(classifier->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge",
570 {16, 22}),
571 std::make_pair(6, 33));
572}
573
Tony Mak6c4cc672018-09-17 11:48:50 +0100574TEST_P(AnnotatorTest, SuggestSelectionWithNewLine) {
575 std::unique_ptr<Annotator> classifier =
576 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100577 ASSERT_TRUE(classifier);
578
579 EXPECT_EQ(classifier->SuggestSelection("abc\n857 225 3556", {4, 7}),
580 std::make_pair(4, 16));
581 EXPECT_EQ(classifier->SuggestSelection("857 225 3556\nabc", {0, 3}),
582 std::make_pair(0, 12));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100583
584 SelectionOptions options;
585 EXPECT_EQ(classifier->SuggestSelection("857 225\n3556\nabc", {0, 3}, options),
586 std::make_pair(0, 7));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100587}
588
Tony Mak6c4cc672018-09-17 11:48:50 +0100589TEST_P(AnnotatorTest, SuggestSelectionWithPunctuation) {
590 std::unique_ptr<Annotator> classifier =
591 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100592 ASSERT_TRUE(classifier);
593
594 // From the right.
595 EXPECT_EQ(classifier->SuggestSelection(
596 "this afternoon BarackObama, gave a speech at", {15, 26}),
597 std::make_pair(15, 26));
598
599 // From the right multiple.
600 EXPECT_EQ(classifier->SuggestSelection(
601 "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}),
602 std::make_pair(15, 26));
603
604 // From the left multiple.
605 EXPECT_EQ(classifier->SuggestSelection(
606 "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}),
607 std::make_pair(21, 32));
608
609 // From both sides.
610 EXPECT_EQ(classifier->SuggestSelection(
611 "this afternoon !BarackObama,- gave a speech at", {16, 27}),
612 std::make_pair(16, 27));
613}
614
Tony Mak6c4cc672018-09-17 11:48:50 +0100615TEST_P(AnnotatorTest, SuggestSelectionNoCrashWithJunk) {
616 std::unique_ptr<Annotator> classifier =
617 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100618 ASSERT_TRUE(classifier);
619
620 // Try passing in bunch of invalid selections.
621 EXPECT_EQ(classifier->SuggestSelection("", {0, 27}), std::make_pair(0, 27));
622 EXPECT_EQ(classifier->SuggestSelection("", {-10, 27}),
623 std::make_pair(-10, 27));
624 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {0, 27}),
625 std::make_pair(0, 27));
626 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-30, 300}),
627 std::make_pair(-30, 300));
628 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-10, -1}),
629 std::make_pair(-10, -1));
630 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {100, 17}),
631 std::make_pair(100, 17));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200632
633 // Try passing invalid utf8.
634 EXPECT_EQ(classifier->SuggestSelection("\xf0\x9f\x98\x8b\x8b", {-1, -1}),
635 std::make_pair(-1, -1));
636}
637
Tony Mak6c4cc672018-09-17 11:48:50 +0100638TEST_P(AnnotatorTest, SuggestSelectionSelectSpace) {
639 std::unique_ptr<Annotator> classifier =
640 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200641 ASSERT_TRUE(classifier);
642
643 EXPECT_EQ(
644 classifier->SuggestSelection("call me at 857 225 3556 today", {14, 15}),
645 std::make_pair(11, 23));
646 EXPECT_EQ(
647 classifier->SuggestSelection("call me at 857 225 3556 today", {10, 11}),
648 std::make_pair(10, 11));
649 EXPECT_EQ(
650 classifier->SuggestSelection("call me at 857 225 3556 today", {23, 24}),
651 std::make_pair(23, 24));
652 EXPECT_EQ(
653 classifier->SuggestSelection("call me at 857 225 3556, today", {23, 24}),
654 std::make_pair(23, 24));
655 EXPECT_EQ(classifier->SuggestSelection("call me at 857 225 3556, today",
656 {14, 17}),
657 std::make_pair(11, 25));
658 EXPECT_EQ(
659 classifier->SuggestSelection("call me at 857-225 3556, today", {14, 17}),
660 std::make_pair(11, 23));
661 EXPECT_EQ(
662 classifier->SuggestSelection(
663 "let's meet at 350 Third Street Cambridge and go there", {30, 31}),
664 std::make_pair(14, 40));
665 EXPECT_EQ(classifier->SuggestSelection("call me today", {4, 5}),
666 std::make_pair(4, 5));
667 EXPECT_EQ(classifier->SuggestSelection("call me today", {7, 8}),
668 std::make_pair(7, 8));
669
670 // With a punctuation around the selected whitespace.
671 EXPECT_EQ(
672 classifier->SuggestSelection(
673 "let's meet at 350 Third Street, Cambridge and go there", {31, 32}),
674 std::make_pair(14, 41));
675
676 // When all's whitespace, should return the original indices.
677 EXPECT_EQ(classifier->SuggestSelection(" ", {0, 1}),
678 std::make_pair(0, 1));
679 EXPECT_EQ(classifier->SuggestSelection(" ", {0, 3}),
680 std::make_pair(0, 3));
681 EXPECT_EQ(classifier->SuggestSelection(" ", {2, 3}),
682 std::make_pair(2, 3));
683 EXPECT_EQ(classifier->SuggestSelection(" ", {5, 6}),
684 std::make_pair(5, 6));
685}
686
Tony Mak6c4cc672018-09-17 11:48:50 +0100687TEST_F(AnnotatorTest, SnapLeftIfWhitespaceSelection) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200688 UnicodeText text;
689
690 text = UTF8ToUnicodeText("abcd efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100691 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200692 std::make_pair(3, 4));
693 text = UTF8ToUnicodeText("abcd ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100694 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200695 std::make_pair(3, 4));
696
697 // Nothing on the left.
698 text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100699 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200700 std::make_pair(4, 5));
701 text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100702 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200703 std::make_pair(0, 1));
704
705 // Whitespace only.
706 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100707 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({2, 3}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200708 std::make_pair(2, 3));
709 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100710 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200711 std::make_pair(4, 5));
712 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100713 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200714 std::make_pair(0, 1));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100715}
716
Tony Mak6c4cc672018-09-17 11:48:50 +0100717TEST_P(AnnotatorTest, Annotate) {
718 std::unique_ptr<Annotator> classifier =
719 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100720 ASSERT_TRUE(classifier);
721
722 const std::string test_string =
Lukas Zilkab23e2122018-02-09 10:25:19 +0100723 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
724 "number is 853 225 3556";
Lukas Zilka21d8c982018-01-24 11:11:20 +0100725 EXPECT_THAT(classifier->Annotate(test_string),
726 ElementsAreArray({
Lukas Zilkab23e2122018-02-09 10:25:19 +0100727 IsAnnotatedSpan(28, 55, "address"),
728 IsAnnotatedSpan(79, 91, "phone"),
Lukas Zilka21d8c982018-01-24 11:11:20 +0100729 }));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100730
731 AnnotationOptions options;
732 EXPECT_THAT(classifier->Annotate("853 225 3556", options),
733 ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
734 EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty());
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200735
736 // Try passing invalid utf8.
737 EXPECT_TRUE(
738 classifier->Annotate("853 225 3556\n\xf0\x9f\x98\x8b\x8b", options)
739 .empty());
Lukas Zilka21d8c982018-01-24 11:11:20 +0100740}
741
Tony Maka0f598b2018-11-20 20:39:04 +0000742
Tony Mak6c4cc672018-09-17 11:48:50 +0100743TEST_P(AnnotatorTest, AnnotateSmallBatches) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100744 const std::string test_model = ReadFile(GetModelPath() + GetParam());
745 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
746
747 // Set the batch size.
748 unpacked_model->selection_options->batch_size = 4;
749 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000750 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100751
Tony Mak6c4cc672018-09-17 11:48:50 +0100752 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
753 reinterpret_cast<const char*>(builder.GetBufferPointer()),
754 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100755 ASSERT_TRUE(classifier);
756
757 const std::string test_string =
758 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
759 "number is 853 225 3556";
760 EXPECT_THAT(classifier->Annotate(test_string),
761 ElementsAreArray({
Lukas Zilkab23e2122018-02-09 10:25:19 +0100762 IsAnnotatedSpan(28, 55, "address"),
763 IsAnnotatedSpan(79, 91, "phone"),
764 }));
765
766 AnnotationOptions options;
767 EXPECT_THAT(classifier->Annotate("853 225 3556", options),
768 ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
769 EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty());
770}
771
#ifdef TC3_UNILIB_ICU
// Repacks the model with triggering_options->min_annotate_confidence set to
// 2.0 and expects Annotate to return no spans at all.
TEST_P(AnnotatorTest, AnnotateFilteringDiscardAll) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test threshold.
  unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
  unpacked_model->triggering_options->min_annotate_confidence =
      2.f;  // Discards all results.
  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  const std::string test_string =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // IsEmpty() avoids comparing the unsigned size() against a signed literal
  // and prints the unexpected spans on failure.
  EXPECT_THAT(classifier->Annotate(test_string), IsEmpty());
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100796
Tony Mak6c4cc672018-09-17 11:48:50 +0100797TEST_P(AnnotatorTest, AnnotateFilteringKeepAll) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100798 const std::string test_model = ReadFile(GetModelPath() + GetParam());
799 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
800
801 // Add test thresholds.
802 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
803 unpacked_model->triggering_options->min_annotate_confidence =
804 0.f; // Keeps all results.
Lukas Zilkaba849e72018-03-08 14:48:21 +0100805 unpacked_model->triggering_options->enabled_modes = ModeFlag_ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100806 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000807 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100808
Tony Mak6c4cc672018-09-17 11:48:50 +0100809 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
810 reinterpret_cast<const char*>(builder.GetBufferPointer()),
811 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100812 ASSERT_TRUE(classifier);
813
814 const std::string test_string =
815 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
816 "number is 853 225 3556";
Lukas Zilkab23e2122018-02-09 10:25:19 +0100817 EXPECT_EQ(classifier->Annotate(test_string).size(), 2);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100818}
819
Tony Mak6c4cc672018-09-17 11:48:50 +0100820TEST_P(AnnotatorTest, AnnotateDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100821 const std::string test_model = ReadFile(GetModelPath() + GetParam());
822 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
823
824 // Disable the model for annotation.
825 unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION;
826 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000827 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100828
Tony Mak6c4cc672018-09-17 11:48:50 +0100829 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
830 reinterpret_cast<const char*>(builder.GetBufferPointer()),
831 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100832 ASSERT_TRUE(classifier);
833 const std::string test_string =
834 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
835 "number is 853 225 3556";
836 EXPECT_THAT(classifier->Annotate(test_string), IsEmpty());
837}
838
Tony Mak6c4cc672018-09-17 11:48:50 +0100839TEST_P(AnnotatorTest, AnnotateFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200840 const std::string test_model = ReadFile(GetModelPath() + GetParam());
841
Tony Mak6c4cc672018-09-17 11:48:50 +0100842 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
843 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200844 ASSERT_TRUE(classifier);
845
846 const std::string test_string =
847 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
848 "number is 853 225 3556";
849
850 EXPECT_THAT(classifier->Annotate(test_string),
851 ElementsAreArray({
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200852 IsAnnotatedSpan(28, 55, "address"),
853 IsAnnotatedSpan(79, 91, "phone"),
854 }));
855
856 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
857 unpacked_model->output_options.reset(new OutputOptionsT);
858
859 // Disable phone annotation
860 unpacked_model->output_options->filtered_collections_annotation.push_back(
861 "phone");
862
863 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000864 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200865
Tony Mak6c4cc672018-09-17 11:48:50 +0100866 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200867 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100868 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200869 ASSERT_TRUE(classifier);
870
871 EXPECT_THAT(classifier->Annotate(test_string),
872 ElementsAreArray({
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200873 IsAnnotatedSpan(28, 55, "address"),
874 }));
875}
876
#ifdef TC3_UNILIB_ICU
// Adds a regex pattern for a "suppress" collection, filters that collection
// from the annotation output, and expects the phone span to disappear while
// the address span stays.
TEST_P(AnnotatorTest, AnnotateFilteredCollectionsSuppress) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  const std::string multi_entity_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // Baseline: the unmodified model reports both the address and the phone.
  {
    const std::unique_ptr<Annotator> baseline = Annotator::FromUnownedBuffer(
        model_bytes.c_str(), model_bytes.size(), &unilib_, &calendarlib_);
    ASSERT_TRUE(baseline);
    EXPECT_THAT(baseline->Annotate(multi_entity_text),
                ElementsAreArray({
                    IsAnnotatedSpan(28, 55, "address"),
                    IsAnnotatedSpan(79, 91, "phone"),
                }));
  }

  const std::unique_ptr<ModelT> model = UnPackModel(model_bytes.c_str());
  model->output_options.reset(new OutputOptionsT);

  // The custom annotator added below wins against the phone classification;
  // its collection is then suppressed through the annotation filter.
  model->output_options->filtered_collections_annotation.push_back("suppress");
  model->regex_model->patterns.push_back(MakePattern(
      "suppress", "(\\d{3} ?\\d{4})",
      /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 2.0));

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, model.get()));
  const std::unique_ptr<Annotator> filtered = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(filtered);

  EXPECT_THAT(filtered->Annotate(multi_entity_text),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
              }));
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200922
#ifdef TC3_CALENDAR_ICU
// Verifies that ClassifyText recognizes dates and resolves them to the
// expected UTC timestamp and granularity for the given reference timezone.
TEST_P(AnnotatorTest, ClassifyTextDate) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Must be fatal: the classifier is dereferenced right below.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // Midnight of 2017-01-01 in Zurich.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText("january 1, 2017", {0, 15}, options);

  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Midnight of 2017-03-01 in Los Angeles.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("march 1, 2017", {0, 13}, options);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Date with a time of day: second granularity.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_SECOND);

  // Date on another line.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText(
      "hello world this is the first line\n"
      "january 1, 2017",
      {35, 50}, options);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // TC3_CALENDAR_ICU
Lukas Zilkaba849e72018-03-08 14:48:21 +0100973
#ifdef TC3_CALENDAR_ICU
// Verifies that the locale decides how the ambiguous date "03.05.1970" is
// read: month first for "en-US", day first for "de".
TEST_P(AnnotatorTest, ClassifyTextDatePriorities) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Must be fatal: the classifier is dereferenced right below.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // en-US: expected timestamp is midnight of 1970-03-05 in Zurich.
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-US";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 5439600000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // de: expected timestamp is midnight of 1970-05-03 in Zurich.
  options.reference_timezone = "Europe/Zurich";
  options.locales = "de";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1u);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 10537200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // TC3_CALENDAR_ICU
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001006
#ifdef TC3_CALENDAR_ICU
// Restricts every datetime pattern to annotation and classification, then
// checks that dates are still classified and annotated while SuggestSelection
// returns the click span unchanged.
TEST_P(AnnotatorTest, SuggestTextDateDisabled) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Disable the patterns for selection.
  for (const auto& pattern : unpacked_model->datetime_model->patterns) {
    pattern->enabled_modes = ModeFlag_ANNOTATION_AND_CLASSIFICATION;
  }
  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);
  EXPECT_EQ("date",
            FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15})));
  EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}),
            std::make_pair(0, 7));
  EXPECT_THAT(classifier->Annotate("january 1, 2017"),
              ElementsAreArray({IsAnnotatedSpan(0, 15, "date")}));
}
#endif  // TC3_CALENDAR_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +01001032
Tony Mak6c4cc672018-09-17 11:48:50 +01001033class TestingAnnotator : public Annotator {
Lukas Zilkab23e2122018-02-09 10:25:19 +01001034 public:
Tony Mak6c4cc672018-09-17 11:48:50 +01001035 TestingAnnotator(const std::string& model, const UniLib* unilib,
1036 const CalendarLib* calendarlib)
1037 : Annotator(ViewModel(model.data(), model.size()), unilib, calendarlib) {}
Lukas Zilkab23e2122018-02-09 10:25:19 +01001038
Tony Mak6c4cc672018-09-17 11:48:50 +01001039 using Annotator::ResolveConflicts;
Lukas Zilkab23e2122018-02-09 10:25:19 +01001040};
1041
1042AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span,
1043 const std::string& collection,
1044 const float score) {
1045 AnnotatedSpan result;
1046 result.span = span;
1047 result.classification.push_back({collection, score});
1048 return result;
1049}
1050
Tony Mak6c4cc672018-09-17 11:48:50 +01001051TEST_F(AnnotatorTest, ResolveConflictsTrivial) {
1052 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001053
1054 std::vector<AnnotatedSpan> candidates{
1055 {MakeAnnotatedSpan({0, 1}, "phone", 1.0)}};
1056
1057 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001058 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001059 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001060 EXPECT_THAT(chosen, ElementsAreArray({0}));
1061}
1062
Tony Mak6c4cc672018-09-17 11:48:50 +01001063TEST_F(AnnotatorTest, ResolveConflictsSequence) {
1064 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001065
1066 std::vector<AnnotatedSpan> candidates{{
1067 MakeAnnotatedSpan({0, 1}, "phone", 1.0),
1068 MakeAnnotatedSpan({1, 2}, "phone", 1.0),
1069 MakeAnnotatedSpan({2, 3}, "phone", 1.0),
1070 MakeAnnotatedSpan({3, 4}, "phone", 1.0),
1071 MakeAnnotatedSpan({4, 5}, "phone", 1.0),
1072 }};
1073
1074 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001075 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001076 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001077 EXPECT_THAT(chosen, ElementsAreArray({0, 1, 2, 3, 4}));
1078}
1079
Tony Mak6c4cc672018-09-17 11:48:50 +01001080TEST_F(AnnotatorTest, ResolveConflictsThreeSpans) {
1081 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001082
1083 std::vector<AnnotatedSpan> candidates{{
1084 MakeAnnotatedSpan({0, 3}, "phone", 1.0),
1085 MakeAnnotatedSpan({1, 5}, "phone", 0.5), // Looser!
1086 MakeAnnotatedSpan({3, 7}, "phone", 1.0),
1087 }};
1088
1089 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001090 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001091 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001092 EXPECT_THAT(chosen, ElementsAreArray({0, 2}));
1093}
1094
Tony Mak6c4cc672018-09-17 11:48:50 +01001095TEST_F(AnnotatorTest, ResolveConflictsThreeSpansReversed) {
1096 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001097
1098 std::vector<AnnotatedSpan> candidates{{
1099 MakeAnnotatedSpan({0, 3}, "phone", 0.5), // Looser!
1100 MakeAnnotatedSpan({1, 5}, "phone", 1.0),
1101 MakeAnnotatedSpan({3, 7}, "phone", 0.6), // Looser!
1102 }};
1103
1104 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001105 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001106 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001107 EXPECT_THAT(chosen, ElementsAreArray({1}));
1108}
1109
Tony Mak6c4cc672018-09-17 11:48:50 +01001110TEST_F(AnnotatorTest, ResolveConflictsFiveSpans) {
1111 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001112
1113 std::vector<AnnotatedSpan> candidates{{
1114 MakeAnnotatedSpan({0, 3}, "phone", 0.5),
1115 MakeAnnotatedSpan({1, 5}, "other", 1.0), // Looser!
1116 MakeAnnotatedSpan({3, 7}, "phone", 0.6),
1117 MakeAnnotatedSpan({8, 12}, "phone", 0.6), // Looser!
1118 MakeAnnotatedSpan({11, 15}, "phone", 0.9),
1119 }};
1120
1121 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001122 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001123 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001124 EXPECT_THAT(chosen, ElementsAreArray({0, 2, 4}));
1125}
Lukas Zilka21d8c982018-01-24 11:11:20 +01001126
#ifdef TC3_UNILIB_ICU
// Surrounds one entity of each type with 50000 spaces on either side and
// checks annotation, selection and classification on the resulting input.
TEST_P(AnnotatorTest, LongInput) {
  const std::unique_ptr<Annotator> annotator = Annotator::FromPath(
      GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Each entry is {expected collection, entity text}.
  const std::vector<std::pair<std::string, std::string>> examples = {
      {"address", "350 Third Street, Cambridge"},
      {"phone", "123 456-7890"},
      {"url", "www.google.com"},
      {"email", "someone@gmail.com"},
      {"flight", "LX 38"},
      {"date", "September 1, 2018"}};
  const std::string padding(50000, ' ');

  for (const auto& example : examples) {
    const std::string& collection = example.first;
    const std::string& value = example.second;
    const std::string input_100k = padding + value + padding;
    const int value_length = value.size();
    const int value_end = 50000 + value_length;

    const auto spans = annotator->Annotate(input_100k);
    EXPECT_THAT(spans, ElementsAreArray(
                           {IsAnnotatedSpan(50000, value_end, collection)}));

    const auto selection =
        annotator->SuggestSelection(input_100k, {50000, 50001});
    EXPECT_EQ(selection, std::make_pair(50000, value_end));

    EXPECT_EQ(collection, FirstResult(annotator->ClassifyText(
                              input_100k, {50000, value_end})));
  }
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkadf710db2018-02-27 12:44:09 +01001157
#ifdef TC3_UNILIB_ICU
// Coarse check whose only purpose is to make sure that processing a long
// input finishes in a reasonable amount of time; the results themselves are
// not inspected.
TEST_P(AnnotatorTest, LongInputNoResultCheck) {
  const std::unique_ptr<Annotator> annotator = Annotator::FromPath(
      GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::vector<std::string> values = {
      "http://www.aaaaaaaaaaaaaaaaaaaa.com "};
  const std::string padding(50000, ' ');

  for (const std::string& value : values) {
    const std::string input_100k = padding + value + padding;
    const int value_length = value.size();
    const int value_end = 50000 + value_length;

    annotator->Annotate(input_100k);
    annotator->SuggestSelection(input_100k, {50000, 50001});
    annotator->ClassifyText(input_100k, {50000, value_end});
  }
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkaba849e72018-03-08 14:48:21 +01001178
#ifdef TC3_UNILIB_ICU
// Checks how classification_options->max_num_tokens affects the collection
// reported for an address selection.
TEST_P(AnnotatorTest, MaxTokenLength) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  const std::unique_ptr<ModelT> model = UnPackModel(model_bytes.c_str());

  // Packs the current state of `model` into `builder` and creates an
  // annotator on top of that buffer. The buffer is not owned by the
  // annotator, so `builder` is kept alive by the caller.
  const auto build_annotator = [&](flatbuffers::FlatBufferBuilder* builder) {
    FinishModelBuffer(*builder, Model::Pack(*builder, model.get()));
    return Annotator::FromUnownedBuffer(
        reinterpret_cast<const char*>(builder->GetBufferPointer()),
        builder->GetSize(), &unilib_, &calendarlib_);
  };

  // Without a limit on the number of tokens the address is recognized.
  model->classification_options->max_num_tokens = -1;
  flatbuffers::FlatBufferBuilder unrestricted_builder;
  std::unique_ptr<Annotator> annotator = build_annotator(&unrestricted_builder);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(
                "I live at 350 Third Street, Cambridge.", {10, 37})),
            "address");

  // Limiting the number of tokens to 3 is expected to suppress the
  // classification.
  model->classification_options->max_num_tokens = 3;
  flatbuffers::FlatBufferBuilder restricted_builder;
  annotator = build_annotator(&restricted_builder);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(
                "I live at 350 Third Street, Cambridge.", {10, 37})),
            "other");
}
#endif  // TC3_UNILIB_ICU
Lukas Zilka434442d2018-04-25 11:38:51 +02001215
#ifdef TC3_UNILIB_ICU
// Checks how classification_options->address_min_num_tokens affects the
// collection reported for an address selection.
TEST_P(AnnotatorTest, MinAddressTokenLength) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  const std::unique_ptr<ModelT> model = UnPackModel(model_bytes.c_str());

  // Packs the current state of `model` into `builder` and creates an
  // annotator on top of that buffer. The buffer is not owned by the
  // annotator, so `builder` is kept alive by the caller.
  const auto build_annotator = [&](flatbuffers::FlatBufferBuilder* builder) {
    FinishModelBuffer(*builder, Model::Pack(*builder, model.get()));
    return Annotator::FromUnownedBuffer(
        reinterpret_cast<const char*>(builder->GetBufferPointer()),
        builder->GetSize(), &unilib_, &calendarlib_);
  };

  // With no minimum number of address tokens the address is recognized.
  model->classification_options->address_min_num_tokens = 0;
  flatbuffers::FlatBufferBuilder unrestricted_builder;
  std::unique_ptr<Annotator> annotator = build_annotator(&unrestricted_builder);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(
                "I live at 350 Third Street, Cambridge.", {10, 37})),
            "address");

  // Requiring at least 5 address tokens is expected to suppress the address
  // classification.
  model->classification_options->address_min_num_tokens = 5;
  flatbuffers::FlatBufferBuilder restricted_builder;
  annotator = build_annotator(&restricted_builder);
  ASSERT_TRUE(annotator);
  EXPECT_EQ(FirstResult(annotator->ClassifyText(
                "I live at 350 Third Street, Cambridge.", {10, 37})),
            "other");
}
#endif  // TC3_UNILIB_ICU
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001252
Lukas Zilka21d8c982018-01-24 11:11:20 +01001253} // namespace
Tony Mak6c4cc672018-09-17 11:48:50 +01001254} // namespace libtextclassifier3