blob: b6290d5201a3a21435703252778ffa3da98a1a43 [file] [log] [blame]
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Tony Mak6c4cc672018-09-17 11:48:50 +010017#include "annotator/annotator.h"
Lukas Zilka21d8c982018-01-24 11:11:20 +010018
19#include <fstream>
20#include <iostream>
21#include <memory>
22#include <string>
23
Tony Mak6c4cc672018-09-17 11:48:50 +010024#include "annotator/model_generated.h"
25#include "annotator/types-test-util.h"
Lukas Zilka21d8c982018-01-24 11:11:20 +010026#include "gmock/gmock.h"
27#include "gtest/gtest.h"
28
Tony Mak6c4cc672018-09-17 11:48:50 +010029namespace libtextclassifier3 {
Lukas Zilka21d8c982018-01-24 11:11:20 +010030namespace {
31
32using testing::ElementsAreArray;
Lukas Zilkaba849e72018-03-08 14:48:21 +010033using testing::IsEmpty;
Lukas Zilka21d8c982018-01-24 11:11:20 +010034using testing::Pair;
Lukas Zilkab23e2122018-02-09 10:25:19 +010035using testing::Values;
Lukas Zilka21d8c982018-01-24 11:11:20 +010036
Lukas Zilkab23e2122018-02-09 10:25:19 +010037std::string FirstResult(const std::vector<ClassificationResult>& results) {
Lukas Zilka21d8c982018-01-24 11:11:20 +010038 if (results.empty()) {
39 return "<INVALID RESULTS>";
40 }
Lukas Zilkab23e2122018-02-09 10:25:19 +010041 return results[0].collection;
Lukas Zilka21d8c982018-01-24 11:11:20 +010042}
43
44MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") {
45 return testing::Value(arg.span, Pair(start, end)) &&
46 testing::Value(FirstResult(arg.classification), best_class);
47}
48
// Reads the whole content of the file `file_name` and returns it as a string.
//
// The stream is opened in binary mode: callers in this file feed the result to
// UnPackModel / FromUnownedBuffer as a serialized model, so the bytes must not
// go through any text-mode newline translation.
//
// If the file cannot be opened, the stream yields no characters and an empty
// string is returned.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream), {});
}
53
54std::string GetModelPath() {
55 return LIBTEXTCLASSIFIER_TEST_DATA_DIR;
56}
57
Tony Mak6c4cc672018-09-17 11:48:50 +010058class AnnotatorTest : public ::testing::TestWithParam<const char*> {
59 protected:
60 AnnotatorTest()
61 : INIT_UNILIB_FOR_TESTING(unilib_),
62 INIT_CALENDARLIB_FOR_TESTING(calendarlib_) {}
63 UniLib unilib_;
64 CalendarLib calendarlib_;
65};
66
67TEST_F(AnnotatorTest, EmbeddingExecutorLoadingFails) {
68 std::unique_ptr<Annotator> classifier = Annotator::FromPath(
69 GetModelPath() + "wrong_embeddings.fb", &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +010070 EXPECT_FALSE(classifier);
71}
72
Tony Mak6c4cc672018-09-17 11:48:50 +010073INSTANTIATE_TEST_CASE_P(ClickContext, AnnotatorTest,
Lukas Zilkab23e2122018-02-09 10:25:19 +010074 Values("test_model_cc.fb"));
Tony Mak6c4cc672018-09-17 11:48:50 +010075INSTANTIATE_TEST_CASE_P(BoundsSensitive, AnnotatorTest,
Lukas Zilkab23e2122018-02-09 10:25:19 +010076 Values("test_model.fb"));
77
Tony Mak6c4cc672018-09-17 11:48:50 +010078TEST_P(AnnotatorTest, ClassifyText) {
79 std::unique_ptr<Annotator> classifier =
80 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +010081 ASSERT_TRUE(classifier);
82
83 EXPECT_EQ("other",
84 FirstResult(classifier->ClassifyText(
85 "this afternoon Barack Obama gave a speech at", {15, 27})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010086 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
87 "Call me at (800) 123-456 today", {11, 24})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010088
89 // More lines.
90 EXPECT_EQ("other",
91 FirstResult(classifier->ClassifyText(
92 "this afternoon Barack Obama gave a speech at|Visit "
93 "www.google.com every today!|Call me at (800) 123-456 today.",
94 {15, 27})));
Lukas Zilka21d8c982018-01-24 11:11:20 +010095 EXPECT_EQ("phone",
96 FirstResult(classifier->ClassifyText(
97 "this afternoon Barack Obama gave a speech at|Visit "
98 "www.google.com every today!|Call me at (800) 123-456 today.",
99 {90, 103})));
100
101 // Single word.
102 EXPECT_EQ("other", FirstResult(classifier->ClassifyText("obama", {0, 5})));
103 EXPECT_EQ("other", FirstResult(classifier->ClassifyText("asdf", {0, 4})));
104 EXPECT_EQ("<INVALID RESULTS>",
105 FirstResult(classifier->ClassifyText("asdf", {0, 0})));
106
107 // Junk.
108 EXPECT_EQ("<INVALID RESULTS>",
109 FirstResult(classifier->ClassifyText("", {0, 0})));
110 EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText(
111 "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5})));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200112 // Test invalid utf8 input.
113 EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText(
114 "\xf0\x9f\x98\x8b\x8b", {0, 0})));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100115}
116
Tony Mak6c4cc672018-09-17 11:48:50 +0100117TEST_P(AnnotatorTest, ClassifyTextDisabledFail) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100118 const std::string test_model = ReadFile(GetModelPath() + GetParam());
119 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
120
121 unpacked_model->classification_model.clear();
122 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
123 unpacked_model->triggering_options->enabled_modes = ModeFlag_SELECTION;
124
125 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000126 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100127
Tony Mak6c4cc672018-09-17 11:48:50 +0100128 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
129 reinterpret_cast<const char*>(builder.GetBufferPointer()),
130 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100131
132 // The classification model is still needed for selection scores.
133 ASSERT_FALSE(classifier);
134}
135
Tony Mak6c4cc672018-09-17 11:48:50 +0100136TEST_P(AnnotatorTest, ClassifyTextDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100137 const std::string test_model = ReadFile(GetModelPath() + GetParam());
138 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
139
140 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
141 unpacked_model->triggering_options->enabled_modes =
142 ModeFlag_ANNOTATION_AND_SELECTION;
143
144 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000145 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100146
Tony Mak6c4cc672018-09-17 11:48:50 +0100147 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
148 reinterpret_cast<const char*>(builder.GetBufferPointer()),
149 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100150 ASSERT_TRUE(classifier);
151
152 EXPECT_THAT(
153 classifier->ClassifyText("Call me at (800) 123-456 today", {11, 24}),
154 IsEmpty());
155}
156
Tony Mak6c4cc672018-09-17 11:48:50 +0100157TEST_P(AnnotatorTest, ClassifyTextFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200158 const std::string test_model = ReadFile(GetModelPath() + GetParam());
159
Tony Mak6c4cc672018-09-17 11:48:50 +0100160 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
161 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200162 ASSERT_TRUE(classifier);
163
164 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
165 "Call me at (800) 123-456 today", {11, 24})));
166
167 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
168 unpacked_model->output_options.reset(new OutputOptionsT);
169
170 // Disable phone classification
171 unpacked_model->output_options->filtered_collections_classification.push_back(
172 "phone");
173
174 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000175 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200176
Tony Mak6c4cc672018-09-17 11:48:50 +0100177 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200178 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100179 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200180 ASSERT_TRUE(classifier);
181
182 EXPECT_EQ("other", FirstResult(classifier->ClassifyText(
183 "Call me at (800) 123-456 today", {11, 24})));
184
185 // Check that the address classification still passes.
186 EXPECT_EQ("address", FirstResult(classifier->ClassifyText(
187 "350 Third Street, Cambridge", {0, 27})));
188}
189
Lukas Zilkab23e2122018-02-09 10:25:19 +0100190std::unique_ptr<RegexModel_::PatternT> MakePattern(
191 const std::string& collection_name, const std::string& pattern,
192 const bool enabled_for_classification, const bool enabled_for_selection,
193 const bool enabled_for_annotation, const float score) {
194 std::unique_ptr<RegexModel_::PatternT> result(new RegexModel_::PatternT);
195 result->collection_name = collection_name;
196 result->pattern = pattern;
Lukas Zilkaba849e72018-03-08 14:48:21 +0100197 // We cannot directly operate with |= on the flag, so use an int here.
198 int enabled_modes = ModeFlag_NONE;
199 if (enabled_for_annotation) enabled_modes |= ModeFlag_ANNOTATION;
200 if (enabled_for_classification) enabled_modes |= ModeFlag_CLASSIFICATION;
201 if (enabled_for_selection) enabled_modes |= ModeFlag_SELECTION;
202 result->enabled_modes = static_cast<ModeFlag>(enabled_modes);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100203 result->target_classification_score = score;
204 result->priority_score = score;
205 return result;
206}
207
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds classification-only regex patterns (person, flight, and a payment card
// pattern with Luhn checksum verification) to the model and checks the first
// classification result for several inputs.
TEST_P(AnnotatorTest, ClassifyTextRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", "Barack Obama", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0));
  patterns.push_back(MakePattern(
      "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "\\d{4}(?: \\d{4}){3}",
                  /*enabled_for_classification=*/true,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());

  const std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Shorthand: first result of classifying the span {begin, end} in `text`.
  const auto best_collection = [&annotator](const std::string& text, int begin,
                                            int end) {
    return FirstResult(annotator->ClassifyText(text, {begin, end}));
  };

  EXPECT_EQ("flight",
            best_collection("Your flight LX373 is delayed by 3 hours.", 12, 17));
  EXPECT_EQ("person",
            best_collection("this afternoon Barack Obama gave a speech at", 15,
                            27));
  EXPECT_EQ("email", best_collection("you@android.com", 0, 15));
  EXPECT_EQ("email", best_collection("Contact me at you@android.com", 14, 29));

  EXPECT_EQ("url", best_collection("Visit www.google.com every today!", 6, 20));

  EXPECT_EQ("flight", best_collection("LX 37", 0, 5));
  EXPECT_EQ("flight", best_collection("flight LX 37 abcd", 7, 12));
  EXPECT_EQ("payment_card", best_collection("cc: 4012 8888 8888 1881", 4, 23));
  EXPECT_EQ("payment_card", best_collection("2221 0067 4735 6281", 0, 19));
  // Luhn check fails.
  EXPECT_EQ("other", best_collection("2221 0067 4735 6282", 0, 19));

  // More lines.
  EXPECT_EQ("url",
            best_collection(
                "this afternoon Barack Obama gave a speech at|Visit "
                "www.google.com every today!|Call me at (800) 123-456 today.",
                51, 65));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
270
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds selection-only regex patterns (person, flight, and a payment card
// pattern with Luhn checksum verification) and checks the suggested spans.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpression) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // Override the priority score of the flight pattern just added.
  patterns.back()->priority_score = 1.1;
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/true,
                  /*enabled_for_annotation=*/false, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());

  const std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  // Shorthand: suggested selection for the click span {begin, end} in `text`.
  const auto suggest = [&annotator](const std::string& text, int begin,
                                    int end) {
    return annotator->SuggestSelection(text, {begin, end});
  };

  // Check regular expression selection.
  EXPECT_EQ(suggest("Your flight MA 0123 is delayed by 3 hours.", 12, 14),
            std::make_pair(12, 19));
  EXPECT_EQ(suggest("this afternoon Barack Obama gave a speech at", 15, 21),
            std::make_pair(15, 27));
  EXPECT_EQ(suggest("cc: 4012 8888 8888 1881", 9, 14), std::make_pair(4, 23));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
312
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Conflict resolution between a selection regex and the model: the flight
// pattern gets priority_score 0.5 and the expected selection is the longer
// span (26, 62) rather than the flight regex match.
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsModelWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // Lower the priority of the flight pattern just added.
  unpacked_model->regex_model->patterns.back()->priority_score = 0.5;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  // Pass the fixture's libraries, as every other test in this file does, so
  // the annotator runs against the test-initialized UniLib / CalendarLib.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(26, 62));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
343
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Conflict resolution between a selection regex and the model: the flight
// pattern gets priority_score 1.1 and the expected selection is the flight
// regex match (55, 62).
TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsRegexWins) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test regex models.
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // Raise the priority of the flight pattern just added.
  unpacked_model->regex_model->patterns.back()->priority_score = 1.1;

  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  // Pass the fixture's libraries, as every other test in this file does, so
  // the annotator runs against the test-initialized UniLib / CalendarLib.
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  EXPECT_EQ(
      classifier->SuggestSelection(
          "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123",
          {55, 57}),
      std::make_pair(55, 62));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
374
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds annotation-only regex patterns (person, flight, and a payment card
// pattern with Luhn checksum verification) and checks the spans returned by
// Annotate for a multi-line text.
TEST_P(AnnotatorTest, AnnotateRegex) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& patterns = model->regex_model->patterns;
  patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0));
  patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5));
  std::unique_ptr<RegexModel_::PatternT> card_pattern =
      MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})",
                  /*enabled_for_classification=*/false,
                  /*enabled_for_selection=*/false,
                  /*enabled_for_annotation=*/true, 1.0);
  card_pattern->verification_options.reset(new VerificationOptionsT);
  card_pattern->verification_options->verify_luhn_checksum = true;
  patterns.push_back(std::move(card_pattern));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());

  const std::unique_ptr<Annotator> annotator = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(annotator);

  const std::string annotated_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556\nand my card is 4012 8888 8888 1881.\n";
  EXPECT_THAT(annotator->Annotate(annotated_text),
              ElementsAreArray({IsAnnotatedSpan(6, 18, "person"),
                                IsAnnotatedSpan(28, 55, "address"),
                                IsAnnotatedSpan(79, 91, "phone"),
                                IsAnnotatedSpan(107, 126, "payment_card")}));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
413
Tony Mak6c4cc672018-09-17 11:48:50 +0100414TEST_P(AnnotatorTest, PhoneFiltering) {
415 std::unique_ptr<Annotator> classifier =
416 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100417 ASSERT_TRUE(classifier);
418
419 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
420 "phone: (123) 456 789", {7, 20})));
421 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
422 "phone: (123) 456 789,0001112", {7, 25})));
423 EXPECT_EQ("other", FirstResult(classifier->ClassifyText(
424 "phone: (123) 456 789,0001112", {7, 28})));
425}
426
Tony Mak6c4cc672018-09-17 11:48:50 +0100427TEST_P(AnnotatorTest, SuggestSelection) {
428 std::unique_ptr<Annotator> classifier =
429 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100430 ASSERT_TRUE(classifier);
431
432 EXPECT_EQ(classifier->SuggestSelection(
433 "this afternoon Barack Obama gave a speech at", {15, 21}),
434 std::make_pair(15, 21));
435
436 // Try passing whole string.
437 // If more than 1 token is specified, we should return back what entered.
438 EXPECT_EQ(
439 classifier->SuggestSelection("350 Third Street, Cambridge", {0, 27}),
440 std::make_pair(0, 27));
441
442 // Single letter.
443 EXPECT_EQ(classifier->SuggestSelection("a", {0, 1}), std::make_pair(0, 1));
444
445 // Single word.
446 EXPECT_EQ(classifier->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4));
447
448 EXPECT_EQ(
449 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
450 std::make_pair(11, 23));
451
452 // Unpaired bracket stripping.
453 EXPECT_EQ(
454 classifier->SuggestSelection("call me at (857) 225 3556 today", {11, 16}),
455 std::make_pair(11, 25));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100456 EXPECT_EQ(classifier->SuggestSelection("call me at (857 today", {11, 15}),
457 std::make_pair(12, 15));
458 EXPECT_EQ(classifier->SuggestSelection("call me at 3556) today", {11, 16}),
459 std::make_pair(11, 15));
460 EXPECT_EQ(classifier->SuggestSelection("call me at )857( today", {11, 16}),
461 std::make_pair(12, 15));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100462
463 // If the resulting selection would be empty, the original span is returned.
464 EXPECT_EQ(classifier->SuggestSelection("call me at )( today", {11, 13}),
465 std::make_pair(11, 13));
466 EXPECT_EQ(classifier->SuggestSelection("call me at ( today", {11, 12}),
467 std::make_pair(11, 12));
468 EXPECT_EQ(classifier->SuggestSelection("call me at ) today", {11, 12}),
469 std::make_pair(11, 12));
470}
471
Tony Mak6c4cc672018-09-17 11:48:50 +0100472TEST_P(AnnotatorTest, SuggestSelectionDisabledFail) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100473 const std::string test_model = ReadFile(GetModelPath() + GetParam());
474 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
475
476 // Disable the selection model.
477 unpacked_model->selection_model.clear();
478 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
479 unpacked_model->triggering_options->enabled_modes = ModeFlag_ANNOTATION;
480
481 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000482 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100483
Tony Mak6c4cc672018-09-17 11:48:50 +0100484 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
485 reinterpret_cast<const char*>(builder.GetBufferPointer()),
486 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100487 // Selection model needs to be present for annotation.
488 ASSERT_FALSE(classifier);
489}
490
Tony Mak6c4cc672018-09-17 11:48:50 +0100491TEST_P(AnnotatorTest, SuggestSelectionDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100492 const std::string test_model = ReadFile(GetModelPath() + GetParam());
493 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
494
495 // Disable the selection model.
496 unpacked_model->selection_model.clear();
497 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
498 unpacked_model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION;
499 unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION;
500
501 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000502 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100503
Tony Mak6c4cc672018-09-17 11:48:50 +0100504 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
505 reinterpret_cast<const char*>(builder.GetBufferPointer()),
506 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100507 ASSERT_TRUE(classifier);
508
509 EXPECT_EQ(
510 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
511 std::make_pair(11, 14));
512
513 EXPECT_EQ("phone", FirstResult(classifier->ClassifyText(
514 "call me at (800) 123-456 today", {11, 24})));
515
516 EXPECT_THAT(classifier->Annotate("call me at (800) 123-456 today"),
517 IsEmpty());
518}
519
Tony Mak6c4cc672018-09-17 11:48:50 +0100520TEST_P(AnnotatorTest, SuggestSelectionFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200521 const std::string test_model = ReadFile(GetModelPath() + GetParam());
522
Tony Mak6c4cc672018-09-17 11:48:50 +0100523 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
524 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200525 ASSERT_TRUE(classifier);
526
527 EXPECT_EQ(
528 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
529 std::make_pair(11, 23));
530
531 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
532 unpacked_model->output_options.reset(new OutputOptionsT);
533
534 // Disable phone selection
535 unpacked_model->output_options->filtered_collections_selection.push_back(
536 "phone");
537 // We need to force this for filtering.
538 unpacked_model->selection_options->always_classify_suggested_selection = true;
539
540 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000541 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200542
Tony Mak6c4cc672018-09-17 11:48:50 +0100543 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200544 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100545 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200546 ASSERT_TRUE(classifier);
547
548 EXPECT_EQ(
549 classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}),
550 std::make_pair(11, 14));
551
552 // Address selection should still work.
553 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}),
554 std::make_pair(0, 27));
555}
556
Tony Mak6c4cc672018-09-17 11:48:50 +0100557TEST_P(AnnotatorTest, SuggestSelectionsAreSymmetric) {
558 std::unique_ptr<Annotator> classifier =
559 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100560 ASSERT_TRUE(classifier);
561
562 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {0, 3}),
563 std::make_pair(0, 27));
564 EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}),
565 std::make_pair(0, 27));
566 EXPECT_EQ(
567 classifier->SuggestSelection("350 Third Street, Cambridge", {10, 16}),
568 std::make_pair(0, 27));
569 EXPECT_EQ(classifier->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge",
570 {16, 22}),
571 std::make_pair(6, 33));
572}
573
Tony Mak6c4cc672018-09-17 11:48:50 +0100574TEST_P(AnnotatorTest, SuggestSelectionWithNewLine) {
575 std::unique_ptr<Annotator> classifier =
576 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100577 ASSERT_TRUE(classifier);
578
579 EXPECT_EQ(classifier->SuggestSelection("abc\n857 225 3556", {4, 7}),
580 std::make_pair(4, 16));
581 EXPECT_EQ(classifier->SuggestSelection("857 225 3556\nabc", {0, 3}),
582 std::make_pair(0, 12));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100583
584 SelectionOptions options;
585 EXPECT_EQ(classifier->SuggestSelection("857 225\n3556\nabc", {0, 3}, options),
586 std::make_pair(0, 7));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100587}
588
Tony Mak6c4cc672018-09-17 11:48:50 +0100589TEST_P(AnnotatorTest, SuggestSelectionWithPunctuation) {
590 std::unique_ptr<Annotator> classifier =
591 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100592 ASSERT_TRUE(classifier);
593
594 // From the right.
595 EXPECT_EQ(classifier->SuggestSelection(
596 "this afternoon BarackObama, gave a speech at", {15, 26}),
597 std::make_pair(15, 26));
598
599 // From the right multiple.
600 EXPECT_EQ(classifier->SuggestSelection(
601 "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}),
602 std::make_pair(15, 26));
603
604 // From the left multiple.
605 EXPECT_EQ(classifier->SuggestSelection(
606 "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}),
607 std::make_pair(21, 32));
608
609 // From both sides.
610 EXPECT_EQ(classifier->SuggestSelection(
611 "this afternoon !BarackObama,- gave a speech at", {16, 27}),
612 std::make_pair(16, 27));
613}
614
Tony Mak6c4cc672018-09-17 11:48:50 +0100615TEST_P(AnnotatorTest, SuggestSelectionNoCrashWithJunk) {
616 std::unique_ptr<Annotator> classifier =
617 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100618 ASSERT_TRUE(classifier);
619
620 // Try passing in bunch of invalid selections.
621 EXPECT_EQ(classifier->SuggestSelection("", {0, 27}), std::make_pair(0, 27));
622 EXPECT_EQ(classifier->SuggestSelection("", {-10, 27}),
623 std::make_pair(-10, 27));
624 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {0, 27}),
625 std::make_pair(0, 27));
626 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-30, 300}),
627 std::make_pair(-30, 300));
628 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-10, -1}),
629 std::make_pair(-10, -1));
630 EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {100, 17}),
631 std::make_pair(100, 17));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200632
633 // Try passing invalid utf8.
634 EXPECT_EQ(classifier->SuggestSelection("\xf0\x9f\x98\x8b\x8b", {-1, -1}),
635 std::make_pair(-1, -1));
636}
637
Tony Mak6c4cc672018-09-17 11:48:50 +0100638TEST_P(AnnotatorTest, SuggestSelectionSelectSpace) {
639 std::unique_ptr<Annotator> classifier =
640 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200641 ASSERT_TRUE(classifier);
642
643 EXPECT_EQ(
644 classifier->SuggestSelection("call me at 857 225 3556 today", {14, 15}),
645 std::make_pair(11, 23));
646 EXPECT_EQ(
647 classifier->SuggestSelection("call me at 857 225 3556 today", {10, 11}),
648 std::make_pair(10, 11));
649 EXPECT_EQ(
650 classifier->SuggestSelection("call me at 857 225 3556 today", {23, 24}),
651 std::make_pair(23, 24));
652 EXPECT_EQ(
653 classifier->SuggestSelection("call me at 857 225 3556, today", {23, 24}),
654 std::make_pair(23, 24));
655 EXPECT_EQ(classifier->SuggestSelection("call me at 857 225 3556, today",
656 {14, 17}),
657 std::make_pair(11, 25));
658 EXPECT_EQ(
659 classifier->SuggestSelection("call me at 857-225 3556, today", {14, 17}),
660 std::make_pair(11, 23));
661 EXPECT_EQ(
662 classifier->SuggestSelection(
663 "let's meet at 350 Third Street Cambridge and go there", {30, 31}),
664 std::make_pair(14, 40));
665 EXPECT_EQ(classifier->SuggestSelection("call me today", {4, 5}),
666 std::make_pair(4, 5));
667 EXPECT_EQ(classifier->SuggestSelection("call me today", {7, 8}),
668 std::make_pair(7, 8));
669
670 // With a punctuation around the selected whitespace.
671 EXPECT_EQ(
672 classifier->SuggestSelection(
673 "let's meet at 350 Third Street, Cambridge and go there", {31, 32}),
674 std::make_pair(14, 41));
675
676 // When all's whitespace, should return the original indices.
677 EXPECT_EQ(classifier->SuggestSelection(" ", {0, 1}),
678 std::make_pair(0, 1));
679 EXPECT_EQ(classifier->SuggestSelection(" ", {0, 3}),
680 std::make_pair(0, 3));
681 EXPECT_EQ(classifier->SuggestSelection(" ", {2, 3}),
682 std::make_pair(2, 3));
683 EXPECT_EQ(classifier->SuggestSelection(" ", {5, 6}),
684 std::make_pair(5, 6));
685}
686
Tony Mak6c4cc672018-09-17 11:48:50 +0100687TEST_F(AnnotatorTest, SnapLeftIfWhitespaceSelection) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200688 UnicodeText text;
689
690 text = UTF8ToUnicodeText("abcd efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100691 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200692 std::make_pair(3, 4));
693 text = UTF8ToUnicodeText("abcd ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100694 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200695 std::make_pair(3, 4));
696
697 // Nothing on the left.
698 text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100699 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200700 std::make_pair(4, 5));
701 text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100702 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200703 std::make_pair(0, 1));
704
705 // Whitespace only.
706 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100707 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({2, 3}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200708 std::make_pair(2, 3));
709 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100710 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200711 std::make_pair(4, 5));
712 text = UTF8ToUnicodeText(" ", /*do_copy=*/false);
Tony Mak6c4cc672018-09-17 11:48:50 +0100713 EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_),
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200714 std::make_pair(0, 1));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100715}
716
Tony Mak6c4cc672018-09-17 11:48:50 +0100717TEST_P(AnnotatorTest, Annotate) {
718 std::unique_ptr<Annotator> classifier =
719 Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100720 ASSERT_TRUE(classifier);
721
722 const std::string test_string =
Lukas Zilkab23e2122018-02-09 10:25:19 +0100723 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
724 "number is 853 225 3556";
Lukas Zilka21d8c982018-01-24 11:11:20 +0100725 EXPECT_THAT(classifier->Annotate(test_string),
726 ElementsAreArray({
Lukas Zilkab23e2122018-02-09 10:25:19 +0100727 IsAnnotatedSpan(28, 55, "address"),
728 IsAnnotatedSpan(79, 91, "phone"),
Lukas Zilka21d8c982018-01-24 11:11:20 +0100729 }));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100730
731 AnnotationOptions options;
732 EXPECT_THAT(classifier->Annotate("853 225 3556", options),
733 ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
734 EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty());
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200735
736 // Try passing invalid utf8.
737 EXPECT_TRUE(
738 classifier->Annotate("853 225 3556\n\xf0\x9f\x98\x8b\x8b", options)
739 .empty());
Lukas Zilka21d8c982018-01-24 11:11:20 +0100740}
741
Tony Mak6c4cc672018-09-17 11:48:50 +0100742TEST_P(AnnotatorTest, AnnotateSmallBatches) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100743 const std::string test_model = ReadFile(GetModelPath() + GetParam());
744 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
745
746 // Set the batch size.
747 unpacked_model->selection_options->batch_size = 4;
748 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000749 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100750
Tony Mak6c4cc672018-09-17 11:48:50 +0100751 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
752 reinterpret_cast<const char*>(builder.GetBufferPointer()),
753 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100754 ASSERT_TRUE(classifier);
755
756 const std::string test_string =
757 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
758 "number is 853 225 3556";
759 EXPECT_THAT(classifier->Annotate(test_string),
760 ElementsAreArray({
Lukas Zilkab23e2122018-02-09 10:25:19 +0100761 IsAnnotatedSpan(28, 55, "address"),
762 IsAnnotatedSpan(79, 91, "phone"),
763 }));
764
765 AnnotationOptions options;
766 EXPECT_THAT(classifier->Annotate("853 225 3556", options),
767 ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")}));
768 EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty());
769}
770
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// With a minimum annotation confidence of 2, Annotate is expected to return
// nothing for a text that otherwise contains an address and a phone number.
TEST_P(AnnotatorTest, AnnotateFilteringDiscardAll) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> mutable_model = UnPackModel(model_bytes.c_str());

  // Install fresh triggering options carrying the test threshold.
  mutable_model->triggering_options.reset(new ModelTriggeringOptionsT);
  mutable_model->triggering_options->min_annotate_confidence =
      2.f;  // Discards all results.

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, mutable_model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());

  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  const std::string test_string =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  const auto annotations = classifier->Annotate(test_string);
  EXPECT_EQ(annotations.size(), 0);
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
Lukas Zilkab23e2122018-02-09 10:25:19 +0100795
Tony Mak6c4cc672018-09-17 11:48:50 +0100796TEST_P(AnnotatorTest, AnnotateFilteringKeepAll) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100797 const std::string test_model = ReadFile(GetModelPath() + GetParam());
798 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
799
800 // Add test thresholds.
801 unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
802 unpacked_model->triggering_options->min_annotate_confidence =
803 0.f; // Keeps all results.
Lukas Zilkaba849e72018-03-08 14:48:21 +0100804 unpacked_model->triggering_options->enabled_modes = ModeFlag_ALL;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100805 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000806 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100807
Tony Mak6c4cc672018-09-17 11:48:50 +0100808 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
809 reinterpret_cast<const char*>(builder.GetBufferPointer()),
810 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100811 ASSERT_TRUE(classifier);
812
813 const std::string test_string =
814 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
815 "number is 853 225 3556";
Lukas Zilkab23e2122018-02-09 10:25:19 +0100816 EXPECT_EQ(classifier->Annotate(test_string).size(), 2);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100817}
818
Tony Mak6c4cc672018-09-17 11:48:50 +0100819TEST_P(AnnotatorTest, AnnotateDisabled) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100820 const std::string test_model = ReadFile(GetModelPath() + GetParam());
821 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
822
823 // Disable the model for annotation.
824 unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION;
825 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000826 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100827
Tony Mak6c4cc672018-09-17 11:48:50 +0100828 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
829 reinterpret_cast<const char*>(builder.GetBufferPointer()),
830 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100831 ASSERT_TRUE(classifier);
832 const std::string test_string =
833 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
834 "number is 853 225 3556";
835 EXPECT_THAT(classifier->Annotate(test_string), IsEmpty());
836}
837
Tony Mak6c4cc672018-09-17 11:48:50 +0100838TEST_P(AnnotatorTest, AnnotateFilteredCollections) {
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200839 const std::string test_model = ReadFile(GetModelPath() + GetParam());
840
Tony Mak6c4cc672018-09-17 11:48:50 +0100841 std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
842 test_model.c_str(), test_model.size(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200843 ASSERT_TRUE(classifier);
844
845 const std::string test_string =
846 "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
847 "number is 853 225 3556";
848
849 EXPECT_THAT(classifier->Annotate(test_string),
850 ElementsAreArray({
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200851 IsAnnotatedSpan(28, 55, "address"),
852 IsAnnotatedSpan(79, 91, "phone"),
853 }));
854
855 std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
856 unpacked_model->output_options.reset(new OutputOptionsT);
857
858 // Disable phone annotation
859 unpacked_model->output_options->filtered_collections_annotation.push_back(
860 "phone");
861
862 flatbuffers::FlatBufferBuilder builder;
Tony Mak51a9e542018-11-02 13:36:22 +0000863 FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200864
Tony Mak6c4cc672018-09-17 11:48:50 +0100865 classifier = Annotator::FromUnownedBuffer(
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200866 reinterpret_cast<const char*>(builder.GetBufferPointer()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100867 builder.GetSize(), &unilib_, &calendarlib_);
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200868 ASSERT_TRUE(classifier);
869
870 EXPECT_THAT(classifier->Annotate(test_string),
871 ElementsAreArray({
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200872 IsAnnotatedSpan(28, 55, "address"),
873 }));
874}
875
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds a regex pattern producing a "suppress" collection over part of the
// phone number, filters that collection out of annotation, and checks that
// only the address remains annotated.
TEST_P(AnnotatorTest, AnnotateFilteredCollectionsSuppress) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  const std::string test_string =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // Baseline: the unmodified model reports both the address and the phone.
  std::unique_ptr<Annotator> baseline = Annotator::FromUnownedBuffer(
      model_bytes.c_str(), model_bytes.size(), &unilib_, &calendarlib_);
  ASSERT_TRUE(baseline);
  EXPECT_THAT(baseline->Annotate(test_string),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  std::unique_ptr<ModelT> mutable_model = UnPackModel(model_bytes.c_str());
  mutable_model->output_options.reset(new OutputOptionsT);

  // We add a custom annotator that wins against the phone classification
  // below and that we subsequently suppress.
  mutable_model->output_options->filtered_collections_annotation.push_back(
      "suppress");
  mutable_model->regex_model->patterns.push_back(MakePattern(
      "suppress", "(\\d{3} ?\\d{4})",
      /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 2.0));

  flatbuffers::FlatBufferBuilder fbb;
  FinishModelBuffer(fbb, Model::Pack(fbb, mutable_model.get()));
  const char* const packed_model =
      reinterpret_cast<const char*>(fbb.GetBufferPointer());

  std::unique_ptr<Annotator> suppressing = Annotator::FromUnownedBuffer(
      packed_model, fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(suppressing);

  EXPECT_THAT(suppressing->Annotate(test_string),
              ElementsAreArray({
                  IsAnnotatedSpan(28, 55, "address"),
              }));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200921
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Checks that ClassifyText recognizes dates and resolves them against
// ClassificationOptions::reference_timezone. Each expected timestamp is the
// parsed local time converted to UTC milliseconds (e.g. 1483225200000 is
// 2017-01-01 00:00 in Europe/Zurich).
TEST_P(AnnotatorTest, ClassifyTextDate) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // The classifier is dereferenced right below, so a failed load has to stop
  // the test here rather than crash on a null pointer.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // Month name, day granularity, Central European time.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText("january 1, 2017", {0, 15}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Same kind of date, Pacific time.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("march 1, 2017", {0, 13}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Numeric date with a time of day: granularity goes down to seconds.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_SECOND);

  // Date on another line.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText(
      "hello world this is the first line\n"
      "january 1, 2017",
      {35, 50}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
Lukas Zilkaba849e72018-03-08 14:48:21 +0100972
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Checks that the locale decides how the ambiguous date "03.05.1970" is read:
// the expected timestamps correspond to March 5 for "en-US" and to May 3 for
// "de", both at midnight in Europe/Zurich.
TEST_P(AnnotatorTest, ClassifyTextDatePriorities) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // The classifier is dereferenced right below, so a failed load has to stop
  // the test here rather than crash on a null pointer.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // Month-first reading.
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-US";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 5439600000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // Day-first reading.
  options.reference_timezone = "Europe/Zurich";
  options.locales = "de";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 10537200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
1005
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Limits every datetime pattern to annotation and classification and checks
// that dates are still classified and annotated, while SuggestSelection
// returns the clicked span unchanged.
TEST_P(AnnotatorTest, SuggestTextDateDisabled) {
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Disable the patterns for selection.
  for (auto& pattern : unpacked_model->datetime_model->patterns) {
    pattern->enabled_modes = ModeFlag_ANNOTATION_AND_CLASSIFICATION;
  }
  flatbuffers::FlatBufferBuilder builder;
  FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get()));

  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Classification still recognizes the date.
  EXPECT_EQ("date",
            FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15})));
  // Selection does not expand "january" to the whole date.
  EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}),
            std::make_pair(0, 7));
  // Annotation still reports the date.
  EXPECT_THAT(classifier->Annotate("january 1, 2017"),
              ElementsAreArray({IsAnnotatedSpan(0, 15, "date")}));
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
1031
Tony Mak6c4cc672018-09-17 11:48:50 +01001032class TestingAnnotator : public Annotator {
Lukas Zilkab23e2122018-02-09 10:25:19 +01001033 public:
Tony Mak6c4cc672018-09-17 11:48:50 +01001034 TestingAnnotator(const std::string& model, const UniLib* unilib,
1035 const CalendarLib* calendarlib)
1036 : Annotator(ViewModel(model.data(), model.size()), unilib, calendarlib) {}
Lukas Zilkab23e2122018-02-09 10:25:19 +01001037
Tony Mak6c4cc672018-09-17 11:48:50 +01001038 using Annotator::ResolveConflicts;
Lukas Zilkab23e2122018-02-09 10:25:19 +01001039};
1040
1041AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span,
1042 const std::string& collection,
1043 const float score) {
1044 AnnotatedSpan result;
1045 result.span = span;
1046 result.classification.push_back({collection, score});
1047 return result;
1048}
1049
Tony Mak6c4cc672018-09-17 11:48:50 +01001050TEST_F(AnnotatorTest, ResolveConflictsTrivial) {
1051 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001052
1053 std::vector<AnnotatedSpan> candidates{
1054 {MakeAnnotatedSpan({0, 1}, "phone", 1.0)}};
1055
1056 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001057 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001058 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001059 EXPECT_THAT(chosen, ElementsAreArray({0}));
1060}
1061
Tony Mak6c4cc672018-09-17 11:48:50 +01001062TEST_F(AnnotatorTest, ResolveConflictsSequence) {
1063 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001064
1065 std::vector<AnnotatedSpan> candidates{{
1066 MakeAnnotatedSpan({0, 1}, "phone", 1.0),
1067 MakeAnnotatedSpan({1, 2}, "phone", 1.0),
1068 MakeAnnotatedSpan({2, 3}, "phone", 1.0),
1069 MakeAnnotatedSpan({3, 4}, "phone", 1.0),
1070 MakeAnnotatedSpan({4, 5}, "phone", 1.0),
1071 }};
1072
1073 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001074 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001075 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001076 EXPECT_THAT(chosen, ElementsAreArray({0, 1, 2, 3, 4}));
1077}
1078
Tony Mak6c4cc672018-09-17 11:48:50 +01001079TEST_F(AnnotatorTest, ResolveConflictsThreeSpans) {
1080 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001081
1082 std::vector<AnnotatedSpan> candidates{{
1083 MakeAnnotatedSpan({0, 3}, "phone", 1.0),
1084 MakeAnnotatedSpan({1, 5}, "phone", 0.5), // Looser!
1085 MakeAnnotatedSpan({3, 7}, "phone", 1.0),
1086 }};
1087
1088 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001089 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001090 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001091 EXPECT_THAT(chosen, ElementsAreArray({0, 2}));
1092}
1093
Tony Mak6c4cc672018-09-17 11:48:50 +01001094TEST_F(AnnotatorTest, ResolveConflictsThreeSpansReversed) {
1095 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001096
1097 std::vector<AnnotatedSpan> candidates{{
1098 MakeAnnotatedSpan({0, 3}, "phone", 0.5), // Looser!
1099 MakeAnnotatedSpan({1, 5}, "phone", 1.0),
1100 MakeAnnotatedSpan({3, 7}, "phone", 0.6), // Looser!
1101 }};
1102
1103 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001104 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001105 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001106 EXPECT_THAT(chosen, ElementsAreArray({1}));
1107}
1108
Tony Mak6c4cc672018-09-17 11:48:50 +01001109TEST_F(AnnotatorTest, ResolveConflictsFiveSpans) {
1110 TestingAnnotator classifier("", &unilib_, &calendarlib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001111
1112 std::vector<AnnotatedSpan> candidates{{
1113 MakeAnnotatedSpan({0, 3}, "phone", 0.5),
1114 MakeAnnotatedSpan({1, 5}, "other", 1.0), // Looser!
1115 MakeAnnotatedSpan({3, 7}, "phone", 0.6),
1116 MakeAnnotatedSpan({8, 12}, "phone", 0.6), // Looser!
1117 MakeAnnotatedSpan({11, 15}, "phone", 0.9),
1118 }};
1119
1120 std::vector<int> chosen;
Lukas Zilkae7962cc2018-03-28 18:09:48 +02001121 classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{},
Lukas Zilkaba849e72018-03-08 14:48:21 +01001122 /*interpreter_manager=*/nullptr, &chosen);
Lukas Zilkab23e2122018-02-09 10:25:19 +01001123 EXPECT_THAT(chosen, ElementsAreArray({0, 2, 4}));
1124}
Lukas Zilka21d8c982018-01-24 11:11:20 +01001125
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Places one entity of each type in the middle of roughly 100k characters of
// spaces and checks that annotation, selection and classification all find it
// at the expected offset.
TEST_P(AnnotatorTest, LongInput) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  // Each entity starts right after the leading padding.
  const int value_begin = 50000;
  const std::string padding(value_begin, ' ');

  // (expected collection, entity text)
  const std::vector<std::pair<std::string, std::string>> type_value_pairs = {
      {"address", "350 Third Street, Cambridge"},
      {"phone", "123 456-7890"},
      {"url", "www.google.com"},
      {"email", "someone@gmail.com"},
      {"flight", "LX 38"},
      {"date", "September 1, 2018"}};

  for (const auto& type_value_pair : type_value_pairs) {
    const std::string& expected_type = type_value_pair.first;
    const std::string& value = type_value_pair.second;

    const std::string input_100k = padding + value + padding;
    const int value_length = value.size();
    const int value_end = value_begin + value_length;

    EXPECT_THAT(classifier->Annotate(input_100k),
                ElementsAreArray(
                    {IsAnnotatedSpan(value_begin, value_end, expected_type)}));
    // Clicking the first character must expand to the whole entity.
    EXPECT_EQ(
        classifier->SuggestSelection(input_100k, {value_begin, value_begin + 1}),
        std::make_pair(value_begin, value_end));
    EXPECT_EQ(expected_type, FirstResult(classifier->ClassifyText(
                                 input_100k, {value_begin, value_end})));
  }
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
1156
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// These coarse tests are there only to make sure the execution happens in
// reasonable amount of time. The results of the calls are deliberately not
// inspected.
TEST_P(AnnotatorTest, LongInputNoResultCheck) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  const int value_begin = 50000;
  const std::string padding(value_begin, ' ');
  const std::vector<std::string> values = {
      "http://www.aaaaaaaaaaaaaaaaaaaa.com "};

  for (const std::string& value : values) {
    const std::string input_100k = padding + value + padding;
    const int value_length = value.size();

    classifier->Annotate(input_100k);
    classifier->SuggestSelection(input_100k, {value_begin, value_begin + 1});
    classifier->ClassifyText(input_100k,
                             {value_begin, value_begin + value_length});
  }
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
1177
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks the effect of classification_options->max_num_tokens: with -1 the
// address span is classified as "address", with a limit of 3 it is expected
// to come back as "other".
TEST_P(AnnotatorTest, MaxTokenLength) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> mutable_model = UnPackModel(model_bytes.c_str());

  const std::string context = "I live at 350 Third Street, Cambridge.";
  const CodepointSpan address_span = {10, 37};

  // With unrestricted number of tokens should behave normally.
  mutable_model->classification_options->max_num_tokens = -1;

  flatbuffers::FlatBufferBuilder unrestricted_fbb;
  FinishModelBuffer(unrestricted_fbb,
                    Model::Pack(unrestricted_fbb, mutable_model.get()));
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(unrestricted_fbb.GetBufferPointer()),
      unrestricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(context, address_span)),
            "address");

  // Restrict the maximum number of tokens to 3 to suppress the
  // classification.
  mutable_model->classification_options->max_num_tokens = 3;

  // A second builder keeps the first buffer untouched while it is in use.
  flatbuffers::FlatBufferBuilder restricted_fbb;
  FinishModelBuffer(restricted_fbb,
                    Model::Pack(restricted_fbb, mutable_model.get()));
  classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(restricted_fbb.GetBufferPointer()),
      restricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(context, address_span)),
            "other");
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
1214
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks the effect of classification_options->address_min_num_tokens: with 0
// the address span is classified as "address", with a minimum of 5 it is
// expected to come back as "other".
TEST_P(AnnotatorTest, MinAddressTokenLength) {
  const std::string model_bytes = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> mutable_model = UnPackModel(model_bytes.c_str());

  const std::string context = "I live at 350 Third Street, Cambridge.";
  const CodepointSpan address_span = {10, 37};

  // With unrestricted number of address tokens should behave normally.
  mutable_model->classification_options->address_min_num_tokens = 0;

  flatbuffers::FlatBufferBuilder unrestricted_fbb;
  FinishModelBuffer(unrestricted_fbb,
                    Model::Pack(unrestricted_fbb, mutable_model.get()));
  std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(unrestricted_fbb.GetBufferPointer()),
      unrestricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(context, address_span)),
            "address");

  // Raise number of address tokens to suppress the address classification.
  mutable_model->classification_options->address_min_num_tokens = 5;

  // A second builder keeps the first buffer untouched while it is in use.
  flatbuffers::FlatBufferBuilder restricted_fbb;
  FinishModelBuffer(restricted_fbb,
                    Model::Pack(restricted_fbb, mutable_model.get()));
  classifier = Annotator::FromUnownedBuffer(
      reinterpret_cast<const char*>(restricted_fbb.GetBufferPointer()),
      restricted_fbb.GetSize(), &unilib_, &calendarlib_);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(context, address_span)),
            "other");
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
1251
Lukas Zilka21d8c982018-01-24 11:11:20 +01001252} // namespace
Tony Mak6c4cc672018-09-17 11:48:50 +01001253} // namespace libtextclassifier3