Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1 | /* |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 2 | * Copyright (C) 2018 The Android Open Source Project |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 17 | #include "annotator/annotator.h" |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 18 | |
| 19 | #include <fstream> |
| 20 | #include <iostream> |
| 21 | #include <memory> |
| 22 | #include <string> |
| 23 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 24 | #include "annotator/model_generated.h" |
| 25 | #include "annotator/types-test-util.h" |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 26 | #include "gmock/gmock.h" |
| 27 | #include "gtest/gtest.h" |
| 28 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 29 | namespace libtextclassifier3 { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 30 | namespace { |
| 31 | |
| 32 | using testing::ElementsAreArray; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 33 | using testing::IsEmpty; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 34 | using testing::Pair; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 35 | using testing::Values; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 36 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 37 | std::string FirstResult(const std::vector<ClassificationResult>& results) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 38 | if (results.empty()) { |
| 39 | return "<INVALID RESULTS>"; |
| 40 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 41 | return results[0].collection; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 42 | } |
| 43 | |
| 44 | MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") { |
| 45 | return testing::Value(arg.span, Pair(start, end)) && |
| 46 | testing::Value(FirstResult(arg.classification), best_class); |
| 47 | } |
| 48 | |
// Reads the entire contents of `file_name` into a string.
//
// The stream is opened in binary mode because the tests use this helper to
// load serialized model buffers; text mode may translate bytes on some
// platforms. If the file cannot be opened, an empty string is returned.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream), {});
}
| 53 | |
| 54 | std::string GetModelPath() { |
| 55 | return LIBTEXTCLASSIFIER_TEST_DATA_DIR; |
| 56 | } |
| 57 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 58 | class AnnotatorTest : public ::testing::TestWithParam<const char*> { |
| 59 | protected: |
| 60 | AnnotatorTest() |
| 61 | : INIT_UNILIB_FOR_TESTING(unilib_), |
| 62 | INIT_CALENDARLIB_FOR_TESTING(calendarlib_) {} |
| 63 | UniLib unilib_; |
| 64 | CalendarLib calendarlib_; |
| 65 | }; |
| 66 | |
| 67 | TEST_F(AnnotatorTest, EmbeddingExecutorLoadingFails) { |
| 68 | std::unique_ptr<Annotator> classifier = Annotator::FromPath( |
| 69 | GetModelPath() + "wrong_embeddings.fb", &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 70 | EXPECT_FALSE(classifier); |
| 71 | } |
| 72 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 73 | INSTANTIATE_TEST_CASE_P(ClickContext, AnnotatorTest, |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 74 | Values("test_model_cc.fb")); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 75 | INSTANTIATE_TEST_CASE_P(BoundsSensitive, AnnotatorTest, |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 76 | Values("test_model.fb")); |
| 77 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 78 | TEST_P(AnnotatorTest, ClassifyText) { |
| 79 | std::unique_ptr<Annotator> classifier = |
| 80 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 81 | ASSERT_TRUE(classifier); |
| 82 | |
| 83 | EXPECT_EQ("other", |
| 84 | FirstResult(classifier->ClassifyText( |
| 85 | "this afternoon Barack Obama gave a speech at", {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 86 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 87 | "Call me at (800) 123-456 today", {11, 24}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 88 | |
| 89 | // More lines. |
| 90 | EXPECT_EQ("other", |
| 91 | FirstResult(classifier->ClassifyText( |
| 92 | "this afternoon Barack Obama gave a speech at|Visit " |
| 93 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 94 | {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 95 | EXPECT_EQ("phone", |
| 96 | FirstResult(classifier->ClassifyText( |
| 97 | "this afternoon Barack Obama gave a speech at|Visit " |
| 98 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 99 | {90, 103}))); |
| 100 | |
| 101 | // Single word. |
| 102 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("obama", {0, 5}))); |
| 103 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("asdf", {0, 4}))); |
| 104 | EXPECT_EQ("<INVALID RESULTS>", |
| 105 | FirstResult(classifier->ClassifyText("asdf", {0, 0}))); |
| 106 | |
| 107 | // Junk. |
| 108 | EXPECT_EQ("<INVALID RESULTS>", |
| 109 | FirstResult(classifier->ClassifyText("", {0, 0}))); |
| 110 | EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText( |
| 111 | "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5}))); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 112 | // Test invalid utf8 input. |
| 113 | EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText( |
| 114 | "\xf0\x9f\x98\x8b\x8b", {0, 0}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 115 | } |
| 116 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 117 | TEST_P(AnnotatorTest, ClassifyTextDisabledFail) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 118 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 119 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 120 | |
| 121 | unpacked_model->classification_model.clear(); |
| 122 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 123 | unpacked_model->triggering_options->enabled_modes = ModeFlag_SELECTION; |
| 124 | |
| 125 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 126 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 127 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 128 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 129 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 130 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 131 | |
| 132 | // The classification model is still needed for selection scores. |
| 133 | ASSERT_FALSE(classifier); |
| 134 | } |
| 135 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 136 | TEST_P(AnnotatorTest, ClassifyTextDisabled) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 137 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 138 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 139 | |
| 140 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 141 | unpacked_model->triggering_options->enabled_modes = |
| 142 | ModeFlag_ANNOTATION_AND_SELECTION; |
| 143 | |
| 144 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 145 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 146 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 147 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 148 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 149 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 150 | ASSERT_TRUE(classifier); |
| 151 | |
| 152 | EXPECT_THAT( |
| 153 | classifier->ClassifyText("Call me at (800) 123-456 today", {11, 24}), |
| 154 | IsEmpty()); |
| 155 | } |
| 156 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 157 | TEST_P(AnnotatorTest, ClassifyTextFilteredCollections) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 158 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 159 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 160 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 161 | test_model.c_str(), test_model.size(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 162 | ASSERT_TRUE(classifier); |
| 163 | |
| 164 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 165 | "Call me at (800) 123-456 today", {11, 24}))); |
| 166 | |
| 167 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 168 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 169 | |
| 170 | // Disable phone classification |
| 171 | unpacked_model->output_options->filtered_collections_classification.push_back( |
| 172 | "phone"); |
| 173 | |
| 174 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 175 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 176 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 177 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 178 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 179 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 180 | ASSERT_TRUE(classifier); |
| 181 | |
| 182 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText( |
| 183 | "Call me at (800) 123-456 today", {11, 24}))); |
| 184 | |
| 185 | // Check that the address classification still passes. |
| 186 | EXPECT_EQ("address", FirstResult(classifier->ClassifyText( |
| 187 | "350 Third Street, Cambridge", {0, 27}))); |
| 188 | } |
| 189 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 190 | std::unique_ptr<RegexModel_::PatternT> MakePattern( |
| 191 | const std::string& collection_name, const std::string& pattern, |
| 192 | const bool enabled_for_classification, const bool enabled_for_selection, |
| 193 | const bool enabled_for_annotation, const float score) { |
| 194 | std::unique_ptr<RegexModel_::PatternT> result(new RegexModel_::PatternT); |
| 195 | result->collection_name = collection_name; |
| 196 | result->pattern = pattern; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 197 | // We cannot directly operate with |= on the flag, so use an int here. |
| 198 | int enabled_modes = ModeFlag_NONE; |
| 199 | if (enabled_for_annotation) enabled_modes |= ModeFlag_ANNOTATION; |
| 200 | if (enabled_for_classification) enabled_modes |= ModeFlag_CLASSIFICATION; |
| 201 | if (enabled_for_selection) enabled_modes |= ModeFlag_SELECTION; |
| 202 | result->enabled_modes = static_cast<ModeFlag>(enabled_modes); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 203 | result->target_classification_score = score; |
| 204 | result->priority_score = score; |
| 205 | return result; |
| 206 | } |
| 207 | |
| 208 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 209 | TEST_P(AnnotatorTest, ClassifyTextRegularExpression) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 210 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 211 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 212 | |
| 213 | // Add test regex models. |
| 214 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 215 | "person", "Barack Obama", /*enabled_for_classification=*/true, |
| 216 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0)); |
| 217 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 218 | "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true, |
| 219 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 220 | std::unique_ptr<RegexModel_::PatternT> verified_pattern = |
| 221 | MakePattern("payment_card", "\\d{4}(?: \\d{4}){3}", |
| 222 | /*enabled_for_classification=*/true, |
| 223 | /*enabled_for_selection=*/false, |
| 224 | /*enabled_for_annotation=*/false, 1.0); |
| 225 | verified_pattern->verification_options.reset(new VerificationOptionsT); |
| 226 | verified_pattern->verification_options->verify_luhn_checksum = true; |
| 227 | unpacked_model->regex_model->patterns.push_back(std::move(verified_pattern)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 228 | |
| 229 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 230 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 231 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 232 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 233 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 234 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 235 | ASSERT_TRUE(classifier); |
| 236 | |
| 237 | EXPECT_EQ("flight", |
| 238 | FirstResult(classifier->ClassifyText( |
| 239 | "Your flight LX373 is delayed by 3 hours.", {12, 17}))); |
| 240 | EXPECT_EQ("person", |
| 241 | FirstResult(classifier->ClassifyText( |
| 242 | "this afternoon Barack Obama gave a speech at", {15, 27}))); |
| 243 | EXPECT_EQ("email", |
| 244 | FirstResult(classifier->ClassifyText("you@android.com", {0, 15}))); |
| 245 | EXPECT_EQ("email", FirstResult(classifier->ClassifyText( |
| 246 | "Contact me at you@android.com", {14, 29}))); |
| 247 | |
| 248 | EXPECT_EQ("url", FirstResult(classifier->ClassifyText( |
| 249 | "Visit www.google.com every today!", {6, 20}))); |
| 250 | |
| 251 | EXPECT_EQ("flight", FirstResult(classifier->ClassifyText("LX 37", {0, 5}))); |
| 252 | EXPECT_EQ("flight", FirstResult(classifier->ClassifyText("flight LX 37 abcd", |
| 253 | {7, 12}))); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 254 | EXPECT_EQ("payment_card", FirstResult(classifier->ClassifyText( |
| 255 | "cc: 4012 8888 8888 1881", {4, 23}))); |
| 256 | EXPECT_EQ("payment_card", FirstResult(classifier->ClassifyText( |
| 257 | "2221 0067 4735 6281", {0, 19}))); |
| 258 | // Luhn check fails. |
| 259 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("2221 0067 4735 6282", |
| 260 | {0, 19}))); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 261 | |
| 262 | // More lines. |
| 263 | EXPECT_EQ("url", |
| 264 | FirstResult(classifier->ClassifyText( |
| 265 | "this afternoon Barack Obama gave a speech at|Visit " |
| 266 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 267 | {51, 65}))); |
| 268 | } |
| 269 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 270 | |
| 271 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 272 | TEST_P(AnnotatorTest, SuggestSelectionRegularExpression) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 273 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 274 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 275 | |
| 276 | // Add test regex models. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 277 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 278 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 279 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 280 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 281 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 282 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 283 | unpacked_model->regex_model->patterns.back()->priority_score = 1.1; |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 284 | std::unique_ptr<RegexModel_::PatternT> verified_pattern = |
| 285 | MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})", |
| 286 | /*enabled_for_classification=*/false, |
| 287 | /*enabled_for_selection=*/true, |
| 288 | /*enabled_for_annotation=*/false, 1.0); |
| 289 | verified_pattern->verification_options.reset(new VerificationOptionsT); |
| 290 | verified_pattern->verification_options->verify_luhn_checksum = true; |
| 291 | unpacked_model->regex_model->patterns.push_back(std::move(verified_pattern)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 292 | |
| 293 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 294 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 295 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 296 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 297 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 298 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 299 | ASSERT_TRUE(classifier); |
| 300 | |
| 301 | // Check regular expression selection. |
| 302 | EXPECT_EQ(classifier->SuggestSelection( |
| 303 | "Your flight MA 0123 is delayed by 3 hours.", {12, 14}), |
| 304 | std::make_pair(12, 19)); |
| 305 | EXPECT_EQ(classifier->SuggestSelection( |
| 306 | "this afternoon Barack Obama gave a speech at", {15, 21}), |
| 307 | std::make_pair(15, 27)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 308 | EXPECT_EQ(classifier->SuggestSelection("cc: 4012 8888 8888 1881", {9, 14}), |
| 309 | std::make_pair(4, 23)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 310 | } |
| 311 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 312 | |
| 313 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 314 | TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsModelWins) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 315 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 316 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 317 | |
| 318 | // Add test regex models. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 319 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 320 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 321 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 322 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 323 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 324 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 325 | unpacked_model->regex_model->patterns.back()->priority_score = 0.5; |
| 326 | |
| 327 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 328 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 329 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 330 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 331 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 332 | builder.GetSize()); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 333 | ASSERT_TRUE(classifier); |
| 334 | |
| 335 | // Check conflict resolution. |
| 336 | EXPECT_EQ( |
| 337 | classifier->SuggestSelection( |
| 338 | "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123", |
| 339 | {55, 57}), |
| 340 | std::make_pair(26, 62)); |
| 341 | } |
| 342 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 343 | |
| 344 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 345 | TEST_P(AnnotatorTest, SuggestSelectionRegularExpressionConflictsRegexWins) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 346 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 347 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 348 | |
| 349 | // Add test regex models. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 350 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 351 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 352 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 353 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 354 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 355 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 356 | unpacked_model->regex_model->patterns.back()->priority_score = 1.1; |
| 357 | |
| 358 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 359 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 360 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 361 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 362 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 363 | builder.GetSize()); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 364 | ASSERT_TRUE(classifier); |
| 365 | |
| 366 | // Check conflict resolution. |
| 367 | EXPECT_EQ( |
| 368 | classifier->SuggestSelection( |
| 369 | "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123", |
| 370 | {55, 57}), |
| 371 | std::make_pair(55, 62)); |
| 372 | } |
| 373 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 374 | |
| 375 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 376 | TEST_P(AnnotatorTest, AnnotateRegex) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 377 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 378 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 379 | |
| 380 | // Add test regex models. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 381 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 382 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 383 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0)); |
| 384 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 385 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 386 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 387 | std::unique_ptr<RegexModel_::PatternT> verified_pattern = |
| 388 | MakePattern("payment_card", "(\\d{4}(?: \\d{4}){3})", |
| 389 | /*enabled_for_classification=*/false, |
| 390 | /*enabled_for_selection=*/false, |
| 391 | /*enabled_for_annotation=*/true, 1.0); |
| 392 | verified_pattern->verification_options.reset(new VerificationOptionsT); |
| 393 | verified_pattern->verification_options->verify_luhn_checksum = true; |
| 394 | unpacked_model->regex_model->patterns.push_back(std::move(verified_pattern)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 395 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 396 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 397 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 398 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 399 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 400 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 401 | ASSERT_TRUE(classifier); |
| 402 | |
| 403 | const std::string test_string = |
| 404 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 405 | "number is 853 225 3556\nand my card is 4012 8888 8888 1881.\n"; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 406 | EXPECT_THAT(classifier->Annotate(test_string), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 407 | ElementsAreArray({IsAnnotatedSpan(6, 18, "person"), |
| 408 | IsAnnotatedSpan(28, 55, "address"), |
| 409 | IsAnnotatedSpan(79, 91, "phone"), |
| 410 | IsAnnotatedSpan(107, 126, "payment_card")})); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 411 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 412 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 413 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 414 | TEST_P(AnnotatorTest, PhoneFiltering) { |
| 415 | std::unique_ptr<Annotator> classifier = |
| 416 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 417 | ASSERT_TRUE(classifier); |
| 418 | |
| 419 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 420 | "phone: (123) 456 789", {7, 20}))); |
| 421 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 422 | "phone: (123) 456 789,0001112", {7, 25}))); |
| 423 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText( |
| 424 | "phone: (123) 456 789,0001112", {7, 28}))); |
| 425 | } |
| 426 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 427 | TEST_P(AnnotatorTest, SuggestSelection) { |
| 428 | std::unique_ptr<Annotator> classifier = |
| 429 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 430 | ASSERT_TRUE(classifier); |
| 431 | |
| 432 | EXPECT_EQ(classifier->SuggestSelection( |
| 433 | "this afternoon Barack Obama gave a speech at", {15, 21}), |
| 434 | std::make_pair(15, 21)); |
| 435 | |
| 436 | // Try passing whole string. |
| 437 | // If more than 1 token is specified, we should return back what entered. |
| 438 | EXPECT_EQ( |
| 439 | classifier->SuggestSelection("350 Third Street, Cambridge", {0, 27}), |
| 440 | std::make_pair(0, 27)); |
| 441 | |
| 442 | // Single letter. |
| 443 | EXPECT_EQ(classifier->SuggestSelection("a", {0, 1}), std::make_pair(0, 1)); |
| 444 | |
| 445 | // Single word. |
| 446 | EXPECT_EQ(classifier->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4)); |
| 447 | |
| 448 | EXPECT_EQ( |
| 449 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 450 | std::make_pair(11, 23)); |
| 451 | |
| 452 | // Unpaired bracket stripping. |
| 453 | EXPECT_EQ( |
| 454 | classifier->SuggestSelection("call me at (857) 225 3556 today", {11, 16}), |
| 455 | std::make_pair(11, 25)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 456 | EXPECT_EQ(classifier->SuggestSelection("call me at (857 today", {11, 15}), |
| 457 | std::make_pair(12, 15)); |
| 458 | EXPECT_EQ(classifier->SuggestSelection("call me at 3556) today", {11, 16}), |
| 459 | std::make_pair(11, 15)); |
| 460 | EXPECT_EQ(classifier->SuggestSelection("call me at )857( today", {11, 16}), |
| 461 | std::make_pair(12, 15)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 462 | |
| 463 | // If the resulting selection would be empty, the original span is returned. |
| 464 | EXPECT_EQ(classifier->SuggestSelection("call me at )( today", {11, 13}), |
| 465 | std::make_pair(11, 13)); |
| 466 | EXPECT_EQ(classifier->SuggestSelection("call me at ( today", {11, 12}), |
| 467 | std::make_pair(11, 12)); |
| 468 | EXPECT_EQ(classifier->SuggestSelection("call me at ) today", {11, 12}), |
| 469 | std::make_pair(11, 12)); |
| 470 | } |
| 471 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 472 | TEST_P(AnnotatorTest, SuggestSelectionDisabledFail) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 473 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 474 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 475 | |
| 476 | // Disable the selection model. |
| 477 | unpacked_model->selection_model.clear(); |
| 478 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 479 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ANNOTATION; |
| 480 | |
| 481 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 482 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 483 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 484 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 485 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 486 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 487 | // Selection model needs to be present for annotation. |
| 488 | ASSERT_FALSE(classifier); |
| 489 | } |
| 490 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 491 | TEST_P(AnnotatorTest, SuggestSelectionDisabled) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 492 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 493 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 494 | |
| 495 | // Disable the selection model. |
| 496 | unpacked_model->selection_model.clear(); |
| 497 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 498 | unpacked_model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION; |
| 499 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION; |
| 500 | |
| 501 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 502 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 503 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 504 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 505 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 506 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 507 | ASSERT_TRUE(classifier); |
| 508 | |
| 509 | EXPECT_EQ( |
| 510 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 511 | std::make_pair(11, 14)); |
| 512 | |
| 513 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 514 | "call me at (800) 123-456 today", {11, 24}))); |
| 515 | |
| 516 | EXPECT_THAT(classifier->Annotate("call me at (800) 123-456 today"), |
| 517 | IsEmpty()); |
| 518 | } |
| 519 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 520 | TEST_P(AnnotatorTest, SuggestSelectionFilteredCollections) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 521 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 522 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 523 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 524 | test_model.c_str(), test_model.size(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 525 | ASSERT_TRUE(classifier); |
| 526 | |
| 527 | EXPECT_EQ( |
| 528 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 529 | std::make_pair(11, 23)); |
| 530 | |
| 531 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 532 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 533 | |
| 534 | // Disable phone selection |
| 535 | unpacked_model->output_options->filtered_collections_selection.push_back( |
| 536 | "phone"); |
| 537 | // We need to force this for filtering. |
| 538 | unpacked_model->selection_options->always_classify_suggested_selection = true; |
| 539 | |
| 540 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 541 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 542 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 543 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 544 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 545 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 546 | ASSERT_TRUE(classifier); |
| 547 | |
| 548 | EXPECT_EQ( |
| 549 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 550 | std::make_pair(11, 14)); |
| 551 | |
| 552 | // Address selection should still work. |
| 553 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}), |
| 554 | std::make_pair(0, 27)); |
| 555 | } |
| 556 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 557 | TEST_P(AnnotatorTest, SuggestSelectionsAreSymmetric) { |
| 558 | std::unique_ptr<Annotator> classifier = |
| 559 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 560 | ASSERT_TRUE(classifier); |
| 561 | |
| 562 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {0, 3}), |
| 563 | std::make_pair(0, 27)); |
| 564 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}), |
| 565 | std::make_pair(0, 27)); |
| 566 | EXPECT_EQ( |
| 567 | classifier->SuggestSelection("350 Third Street, Cambridge", {10, 16}), |
| 568 | std::make_pair(0, 27)); |
| 569 | EXPECT_EQ(classifier->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge", |
| 570 | {16, 22}), |
| 571 | std::make_pair(6, 33)); |
| 572 | } |
| 573 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 574 | TEST_P(AnnotatorTest, SuggestSelectionWithNewLine) { |
| 575 | std::unique_ptr<Annotator> classifier = |
| 576 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 577 | ASSERT_TRUE(classifier); |
| 578 | |
| 579 | EXPECT_EQ(classifier->SuggestSelection("abc\n857 225 3556", {4, 7}), |
| 580 | std::make_pair(4, 16)); |
| 581 | EXPECT_EQ(classifier->SuggestSelection("857 225 3556\nabc", {0, 3}), |
| 582 | std::make_pair(0, 12)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 583 | |
| 584 | SelectionOptions options; |
| 585 | EXPECT_EQ(classifier->SuggestSelection("857 225\n3556\nabc", {0, 3}, options), |
| 586 | std::make_pair(0, 7)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 587 | } |
| 588 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 589 | TEST_P(AnnotatorTest, SuggestSelectionWithPunctuation) { |
| 590 | std::unique_ptr<Annotator> classifier = |
| 591 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 592 | ASSERT_TRUE(classifier); |
| 593 | |
| 594 | // From the right. |
| 595 | EXPECT_EQ(classifier->SuggestSelection( |
| 596 | "this afternoon BarackObama, gave a speech at", {15, 26}), |
| 597 | std::make_pair(15, 26)); |
| 598 | |
| 599 | // From the right multiple. |
| 600 | EXPECT_EQ(classifier->SuggestSelection( |
| 601 | "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}), |
| 602 | std::make_pair(15, 26)); |
| 603 | |
| 604 | // From the left multiple. |
| 605 | EXPECT_EQ(classifier->SuggestSelection( |
| 606 | "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}), |
| 607 | std::make_pair(21, 32)); |
| 608 | |
| 609 | // From both sides. |
| 610 | EXPECT_EQ(classifier->SuggestSelection( |
| 611 | "this afternoon !BarackObama,- gave a speech at", {16, 27}), |
| 612 | std::make_pair(16, 27)); |
| 613 | } |
| 614 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 615 | TEST_P(AnnotatorTest, SuggestSelectionNoCrashWithJunk) { |
| 616 | std::unique_ptr<Annotator> classifier = |
| 617 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 618 | ASSERT_TRUE(classifier); |
| 619 | |
| 620 | // Try passing in bunch of invalid selections. |
| 621 | EXPECT_EQ(classifier->SuggestSelection("", {0, 27}), std::make_pair(0, 27)); |
| 622 | EXPECT_EQ(classifier->SuggestSelection("", {-10, 27}), |
| 623 | std::make_pair(-10, 27)); |
| 624 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {0, 27}), |
| 625 | std::make_pair(0, 27)); |
| 626 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-30, 300}), |
| 627 | std::make_pair(-30, 300)); |
| 628 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-10, -1}), |
| 629 | std::make_pair(-10, -1)); |
| 630 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {100, 17}), |
| 631 | std::make_pair(100, 17)); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 632 | |
| 633 | // Try passing invalid utf8. |
| 634 | EXPECT_EQ(classifier->SuggestSelection("\xf0\x9f\x98\x8b\x8b", {-1, -1}), |
| 635 | std::make_pair(-1, -1)); |
| 636 | } |
| 637 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 638 | TEST_P(AnnotatorTest, SuggestSelectionSelectSpace) { |
| 639 | std::unique_ptr<Annotator> classifier = |
| 640 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 641 | ASSERT_TRUE(classifier); |
| 642 | |
| 643 | EXPECT_EQ( |
| 644 | classifier->SuggestSelection("call me at 857 225 3556 today", {14, 15}), |
| 645 | std::make_pair(11, 23)); |
| 646 | EXPECT_EQ( |
| 647 | classifier->SuggestSelection("call me at 857 225 3556 today", {10, 11}), |
| 648 | std::make_pair(10, 11)); |
| 649 | EXPECT_EQ( |
| 650 | classifier->SuggestSelection("call me at 857 225 3556 today", {23, 24}), |
| 651 | std::make_pair(23, 24)); |
| 652 | EXPECT_EQ( |
| 653 | classifier->SuggestSelection("call me at 857 225 3556, today", {23, 24}), |
| 654 | std::make_pair(23, 24)); |
| 655 | EXPECT_EQ(classifier->SuggestSelection("call me at 857 225 3556, today", |
| 656 | {14, 17}), |
| 657 | std::make_pair(11, 25)); |
| 658 | EXPECT_EQ( |
| 659 | classifier->SuggestSelection("call me at 857-225 3556, today", {14, 17}), |
| 660 | std::make_pair(11, 23)); |
| 661 | EXPECT_EQ( |
| 662 | classifier->SuggestSelection( |
| 663 | "let's meet at 350 Third Street Cambridge and go there", {30, 31}), |
| 664 | std::make_pair(14, 40)); |
| 665 | EXPECT_EQ(classifier->SuggestSelection("call me today", {4, 5}), |
| 666 | std::make_pair(4, 5)); |
| 667 | EXPECT_EQ(classifier->SuggestSelection("call me today", {7, 8}), |
| 668 | std::make_pair(7, 8)); |
| 669 | |
| 670 | // With a punctuation around the selected whitespace. |
| 671 | EXPECT_EQ( |
| 672 | classifier->SuggestSelection( |
| 673 | "let's meet at 350 Third Street, Cambridge and go there", {31, 32}), |
| 674 | std::make_pair(14, 41)); |
| 675 | |
| 676 | // When all's whitespace, should return the original indices. |
| 677 | EXPECT_EQ(classifier->SuggestSelection(" ", {0, 1}), |
| 678 | std::make_pair(0, 1)); |
| 679 | EXPECT_EQ(classifier->SuggestSelection(" ", {0, 3}), |
| 680 | std::make_pair(0, 3)); |
| 681 | EXPECT_EQ(classifier->SuggestSelection(" ", {2, 3}), |
| 682 | std::make_pair(2, 3)); |
| 683 | EXPECT_EQ(classifier->SuggestSelection(" ", {5, 6}), |
| 684 | std::make_pair(5, 6)); |
| 685 | } |
| 686 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 687 | TEST_F(AnnotatorTest, SnapLeftIfWhitespaceSelection) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 688 | UnicodeText text; |
| 689 | |
| 690 | text = UTF8ToUnicodeText("abcd efgh", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 691 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 692 | std::make_pair(3, 4)); |
| 693 | text = UTF8ToUnicodeText("abcd ", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 694 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 695 | std::make_pair(3, 4)); |
| 696 | |
| 697 | // Nothing on the left. |
| 698 | text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 699 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 700 | std::make_pair(4, 5)); |
| 701 | text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 702 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 703 | std::make_pair(0, 1)); |
| 704 | |
| 705 | // Whitespace only. |
| 706 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 707 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({2, 3}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 708 | std::make_pair(2, 3)); |
| 709 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 710 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 711 | std::make_pair(4, 5)); |
| 712 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 713 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib_), |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 714 | std::make_pair(0, 1)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 715 | } |
| 716 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 717 | TEST_P(AnnotatorTest, Annotate) { |
| 718 | std::unique_ptr<Annotator> classifier = |
| 719 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 720 | ASSERT_TRUE(classifier); |
| 721 | |
| 722 | const std::string test_string = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 723 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 724 | "number is 853 225 3556"; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 725 | EXPECT_THAT(classifier->Annotate(test_string), |
| 726 | ElementsAreArray({ |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 727 | IsAnnotatedSpan(28, 55, "address"), |
| 728 | IsAnnotatedSpan(79, 91, "phone"), |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 729 | })); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 730 | |
| 731 | AnnotationOptions options; |
| 732 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 733 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 734 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 735 | |
| 736 | // Try passing invalid utf8. |
| 737 | EXPECT_TRUE( |
| 738 | classifier->Annotate("853 225 3556\n\xf0\x9f\x98\x8b\x8b", options) |
| 739 | .empty()); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 740 | } |
| 741 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 742 | TEST_P(AnnotatorTest, AnnotateSmallBatches) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 743 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 744 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 745 | |
| 746 | // Set the batch size. |
| 747 | unpacked_model->selection_options->batch_size = 4; |
| 748 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 749 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 750 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 751 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 752 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 753 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 754 | ASSERT_TRUE(classifier); |
| 755 | |
| 756 | const std::string test_string = |
| 757 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 758 | "number is 853 225 3556"; |
| 759 | EXPECT_THAT(classifier->Annotate(test_string), |
| 760 | ElementsAreArray({ |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 761 | IsAnnotatedSpan(28, 55, "address"), |
| 762 | IsAnnotatedSpan(79, 91, "phone"), |
| 763 | })); |
| 764 | |
| 765 | AnnotationOptions options; |
| 766 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 767 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 768 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
| 769 | } |
| 770 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 771 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 772 | TEST_P(AnnotatorTest, AnnotateFilteringDiscardAll) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 773 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 774 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 775 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 776 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 777 | // Add test threshold. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 778 | unpacked_model->triggering_options->min_annotate_confidence = |
| 779 | 2.f; // Discards all results. |
| 780 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 781 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 782 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 783 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 784 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 785 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 786 | ASSERT_TRUE(classifier); |
| 787 | |
| 788 | const std::string test_string = |
| 789 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 790 | "number is 853 225 3556"; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 791 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 792 | EXPECT_EQ(classifier->Annotate(test_string).size(), 0); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 793 | } |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 794 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 795 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 796 | TEST_P(AnnotatorTest, AnnotateFilteringKeepAll) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 797 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 798 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 799 | |
| 800 | // Add test thresholds. |
| 801 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 802 | unpacked_model->triggering_options->min_annotate_confidence = |
| 803 | 0.f; // Keeps all results. |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 804 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ALL; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 805 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 806 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 807 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 808 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 809 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 810 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 811 | ASSERT_TRUE(classifier); |
| 812 | |
| 813 | const std::string test_string = |
| 814 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 815 | "number is 853 225 3556"; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 816 | EXPECT_EQ(classifier->Annotate(test_string).size(), 2); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 817 | } |
| 818 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 819 | TEST_P(AnnotatorTest, AnnotateDisabled) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 820 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 821 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 822 | |
| 823 | // Disable the model for annotation. |
| 824 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION; |
| 825 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 826 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 827 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 828 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 829 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 830 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 831 | ASSERT_TRUE(classifier); |
| 832 | const std::string test_string = |
| 833 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 834 | "number is 853 225 3556"; |
| 835 | EXPECT_THAT(classifier->Annotate(test_string), IsEmpty()); |
| 836 | } |
| 837 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 838 | TEST_P(AnnotatorTest, AnnotateFilteredCollections) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 839 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 840 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 841 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 842 | test_model.c_str(), test_model.size(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 843 | ASSERT_TRUE(classifier); |
| 844 | |
| 845 | const std::string test_string = |
| 846 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 847 | "number is 853 225 3556"; |
| 848 | |
| 849 | EXPECT_THAT(classifier->Annotate(test_string), |
| 850 | ElementsAreArray({ |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 851 | IsAnnotatedSpan(28, 55, "address"), |
| 852 | IsAnnotatedSpan(79, 91, "phone"), |
| 853 | })); |
| 854 | |
| 855 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 856 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 857 | |
| 858 | // Disable phone annotation |
| 859 | unpacked_model->output_options->filtered_collections_annotation.push_back( |
| 860 | "phone"); |
| 861 | |
| 862 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 863 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 864 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 865 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 866 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 867 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 868 | ASSERT_TRUE(classifier); |
| 869 | |
| 870 | EXPECT_THAT(classifier->Annotate(test_string), |
| 871 | ElementsAreArray({ |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 872 | IsAnnotatedSpan(28, 55, "address"), |
| 873 | })); |
| 874 | } |
| 875 | |
| 876 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 877 | TEST_P(AnnotatorTest, AnnotateFilteredCollectionsSuppress) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 878 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 879 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 880 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 881 | test_model.c_str(), test_model.size(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 882 | ASSERT_TRUE(classifier); |
| 883 | |
| 884 | const std::string test_string = |
| 885 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 886 | "number is 853 225 3556"; |
| 887 | |
| 888 | EXPECT_THAT(classifier->Annotate(test_string), |
| 889 | ElementsAreArray({ |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 890 | IsAnnotatedSpan(28, 55, "address"), |
| 891 | IsAnnotatedSpan(79, 91, "phone"), |
| 892 | })); |
| 893 | |
| 894 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 895 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 896 | |
| 897 | // We add a custom annotator that wins against the phone classification |
| 898 | // below and that we subsequently suppress. |
| 899 | unpacked_model->output_options->filtered_collections_annotation.push_back( |
| 900 | "suppress"); |
| 901 | |
| 902 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 903 | "suppress", "(\\d{3} ?\\d{4})", |
| 904 | /*enabled_for_classification=*/false, |
| 905 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 2.0)); |
| 906 | |
| 907 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 908 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 909 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 910 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 911 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 912 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 913 | ASSERT_TRUE(classifier); |
| 914 | |
| 915 | EXPECT_THAT(classifier->Annotate(test_string), |
| 916 | ElementsAreArray({ |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 917 | IsAnnotatedSpan(28, 55, "address"), |
| 918 | })); |
| 919 | } |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 920 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 921 | |
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Classifies several date strings under different reference timezones and
// checks the collection, the parsed UTC timestamp in milliseconds, and the
// reported granularity of the single expected result.
TEST_P(AnnotatorTest, ClassifyTextDate) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Fatal assertion: every statement below dereferences the annotator, so a
  // failed load must stop the test here rather than crash on a null pointer.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // Day-granularity date, Zurich reference timezone.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText("january 1, 2017", {0, 15}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
  result.clear();

  // Day-granularity date, Los Angeles reference timezone.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("march 1, 2017", {0, 13}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
  result.clear();

  // Full date and time down to the second.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_SECOND);
  result.clear();

  // Date on another line.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText(
      "hello world this is the first line\n"
      "january 1, 2017",
      {35, 50}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 972 | |
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Classifies the same ambiguous numeric date, "03.05.1970", under two
// locales and checks that the parsed UTC timestamp differs between them.
TEST_P(AnnotatorTest, ClassifyTextDatePriorities) {
  std::unique_ptr<Annotator> classifier =
      Annotator::FromPath(GetModelPath() + GetParam());
  // Fatal assertion: every statement below dereferences the annotator, so a
  // failed load must stop the test here rather than crash on a null pointer.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // en-US locale.
  result.clear();
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-US";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 5439600000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  // de locale: same text, different expected timestamp.
  result.clear();
  options.reference_timezone = "Europe/Zurich";
  options.locales = "de";
  result = classifier->ClassifyText("03.05.1970", {0, 10}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 10537200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
| 1005 | |
| 1006 | #ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1007 | TEST_P(AnnotatorTest, SuggestTextDateDisabled) { |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1008 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 1009 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 1010 | |
| 1011 | // Disable the patterns for selection. |
| 1012 | for (int i = 0; i < unpacked_model->datetime_model->patterns.size(); i++) { |
| 1013 | unpacked_model->datetime_model->patterns[i]->enabled_modes = |
| 1014 | ModeFlag_ANNOTATION_AND_CLASSIFICATION; |
| 1015 | } |
| 1016 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 1017 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1018 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1019 | std::unique_ptr<Annotator> classifier = Annotator::FromUnownedBuffer( |
| 1020 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 1021 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1022 | ASSERT_TRUE(classifier); |
| 1023 | EXPECT_EQ("date", |
| 1024 | FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15}))); |
| 1025 | EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}), |
| 1026 | std::make_pair(0, 7)); |
| 1027 | EXPECT_THAT(classifier->Annotate("january 1, 2017"), |
| 1028 | ElementsAreArray({IsAnnotatedSpan(0, 15, "date")})); |
| 1029 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1030 | #endif // LIBTEXTCLASSIFIER_CALENDAR_ICU |
| 1031 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1032 | class TestingAnnotator : public Annotator { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1033 | public: |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1034 | TestingAnnotator(const std::string& model, const UniLib* unilib, |
| 1035 | const CalendarLib* calendarlib) |
| 1036 | : Annotator(ViewModel(model.data(), model.size()), unilib, calendarlib) {} |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1037 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1038 | using Annotator::ResolveConflicts; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1039 | }; |
| 1040 | |
| 1041 | AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span, |
| 1042 | const std::string& collection, |
| 1043 | const float score) { |
| 1044 | AnnotatedSpan result; |
| 1045 | result.span = span; |
| 1046 | result.classification.push_back({collection, score}); |
| 1047 | return result; |
| 1048 | } |
| 1049 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1050 | TEST_F(AnnotatorTest, ResolveConflictsTrivial) { |
| 1051 | TestingAnnotator classifier("", &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1052 | |
| 1053 | std::vector<AnnotatedSpan> candidates{ |
| 1054 | {MakeAnnotatedSpan({0, 1}, "phone", 1.0)}}; |
| 1055 | |
| 1056 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1057 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1058 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1059 | EXPECT_THAT(chosen, ElementsAreArray({0})); |
| 1060 | } |
| 1061 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1062 | TEST_F(AnnotatorTest, ResolveConflictsSequence) { |
| 1063 | TestingAnnotator classifier("", &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1064 | |
| 1065 | std::vector<AnnotatedSpan> candidates{{ |
| 1066 | MakeAnnotatedSpan({0, 1}, "phone", 1.0), |
| 1067 | MakeAnnotatedSpan({1, 2}, "phone", 1.0), |
| 1068 | MakeAnnotatedSpan({2, 3}, "phone", 1.0), |
| 1069 | MakeAnnotatedSpan({3, 4}, "phone", 1.0), |
| 1070 | MakeAnnotatedSpan({4, 5}, "phone", 1.0), |
| 1071 | }}; |
| 1072 | |
| 1073 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1074 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1075 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1076 | EXPECT_THAT(chosen, ElementsAreArray({0, 1, 2, 3, 4})); |
| 1077 | } |
| 1078 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1079 | TEST_F(AnnotatorTest, ResolveConflictsThreeSpans) { |
| 1080 | TestingAnnotator classifier("", &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1081 | |
| 1082 | std::vector<AnnotatedSpan> candidates{{ |
| 1083 | MakeAnnotatedSpan({0, 3}, "phone", 1.0), |
| 1084 | MakeAnnotatedSpan({1, 5}, "phone", 0.5), // Looser! |
| 1085 | MakeAnnotatedSpan({3, 7}, "phone", 1.0), |
| 1086 | }}; |
| 1087 | |
| 1088 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1089 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1090 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1091 | EXPECT_THAT(chosen, ElementsAreArray({0, 2})); |
| 1092 | } |
| 1093 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1094 | TEST_F(AnnotatorTest, ResolveConflictsThreeSpansReversed) { |
| 1095 | TestingAnnotator classifier("", &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1096 | |
| 1097 | std::vector<AnnotatedSpan> candidates{{ |
| 1098 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), // Looser! |
| 1099 | MakeAnnotatedSpan({1, 5}, "phone", 1.0), |
| 1100 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), // Looser! |
| 1101 | }}; |
| 1102 | |
| 1103 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1104 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1105 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1106 | EXPECT_THAT(chosen, ElementsAreArray({1})); |
| 1107 | } |
| 1108 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1109 | TEST_F(AnnotatorTest, ResolveConflictsFiveSpans) { |
| 1110 | TestingAnnotator classifier("", &unilib_, &calendarlib_); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1111 | |
| 1112 | std::vector<AnnotatedSpan> candidates{{ |
| 1113 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), |
| 1114 | MakeAnnotatedSpan({1, 5}, "other", 1.0), // Looser! |
| 1115 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), |
| 1116 | MakeAnnotatedSpan({8, 12}, "phone", 0.6), // Looser! |
| 1117 | MakeAnnotatedSpan({11, 15}, "phone", 0.9), |
| 1118 | }}; |
| 1119 | |
| 1120 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1121 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1122 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1123 | EXPECT_THAT(chosen, ElementsAreArray({0, 2, 4})); |
| 1124 | } |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1125 | |
Lukas Zilka | df710db | 2018-02-27 12:44:09 +0100 | [diff] [blame] | 1126 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1127 | TEST_P(AnnotatorTest, LongInput) { |
| 1128 | std::unique_ptr<Annotator> classifier = |
| 1129 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | df710db | 2018-02-27 12:44:09 +0100 | [diff] [blame] | 1130 | ASSERT_TRUE(classifier); |
| 1131 | |
| 1132 | for (const auto& type_value_pair : |
| 1133 | std::vector<std::pair<std::string, std::string>>{ |
| 1134 | {"address", "350 Third Street, Cambridge"}, |
| 1135 | {"phone", "123 456-7890"}, |
| 1136 | {"url", "www.google.com"}, |
| 1137 | {"email", "someone@gmail.com"}, |
| 1138 | {"flight", "LX 38"}, |
| 1139 | {"date", "September 1, 2018"}}) { |
| 1140 | const std::string input_100k = std::string(50000, ' ') + |
| 1141 | type_value_pair.second + |
| 1142 | std::string(50000, ' '); |
| 1143 | const int value_length = type_value_pair.second.size(); |
| 1144 | |
| 1145 | EXPECT_THAT(classifier->Annotate(input_100k), |
| 1146 | ElementsAreArray({IsAnnotatedSpan(50000, 50000 + value_length, |
| 1147 | type_value_pair.first)})); |
| 1148 | EXPECT_EQ(classifier->SuggestSelection(input_100k, {50000, 50001}), |
| 1149 | std::make_pair(50000, 50000 + value_length)); |
| 1150 | EXPECT_EQ(type_value_pair.first, |
| 1151 | FirstResult(classifier->ClassifyText( |
| 1152 | input_100k, {50000, 50000 + value_length}))); |
| 1153 | } |
| 1154 | } |
| 1155 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 1156 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1157 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 1158 | // These coarse tests are there only to make sure the execution happens in |
| 1159 | // reasonable amount of time. |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1160 | TEST_P(AnnotatorTest, LongInputNoResultCheck) { |
| 1161 | std::unique_ptr<Annotator> classifier = |
| 1162 | Annotator::FromPath(GetModelPath() + GetParam(), &unilib_, &calendarlib_); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1163 | ASSERT_TRUE(classifier); |
| 1164 | |
| 1165 | for (const std::string& value : |
| 1166 | std::vector<std::string>{"http://www.aaaaaaaaaaaaaaaaaaaa.com "}) { |
| 1167 | const std::string input_100k = |
| 1168 | std::string(50000, ' ') + value + std::string(50000, ' '); |
| 1169 | const int value_length = value.size(); |
| 1170 | |
| 1171 | classifier->Annotate(input_100k); |
| 1172 | classifier->SuggestSelection(input_100k, {50000, 50001}); |
| 1173 | classifier->ClassifyText(input_100k, {50000, 50000 + value_length}); |
| 1174 | } |
| 1175 | } |
| 1176 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 1177 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1178 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1179 | TEST_P(AnnotatorTest, MaxTokenLength) { |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1180 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 1181 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 1182 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1183 | std::unique_ptr<Annotator> classifier; |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1184 | |
| 1185 | // With unrestricted number of tokens should behave normally. |
| 1186 | unpacked_model->classification_options->max_num_tokens = -1; |
| 1187 | |
| 1188 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 1189 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1190 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1191 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1192 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1193 | ASSERT_TRUE(classifier); |
| 1194 | |
| 1195 | EXPECT_EQ(FirstResult(classifier->ClassifyText( |
| 1196 | "I live at 350 Third Street, Cambridge.", {10, 37})), |
| 1197 | "address"); |
| 1198 | |
| 1199 | // Raise the maximum number of tokens to suppress the classification. |
| 1200 | unpacked_model->classification_options->max_num_tokens = 3; |
| 1201 | |
| 1202 | flatbuffers::FlatBufferBuilder builder2; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 1203 | FinishModelBuffer(builder2, Model::Pack(builder2, unpacked_model.get())); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1204 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1205 | reinterpret_cast<const char*>(builder2.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1206 | builder2.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 1207 | ASSERT_TRUE(classifier); |
| 1208 | |
| 1209 | EXPECT_EQ(FirstResult(classifier->ClassifyText( |
| 1210 | "I live at 350 Third Street, Cambridge.", {10, 37})), |
| 1211 | "other"); |
| 1212 | } |
| 1213 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 1214 | |
| 1215 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1216 | TEST_P(AnnotatorTest, MinAddressTokenLength) { |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1217 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 1218 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 1219 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1220 | std::unique_ptr<Annotator> classifier; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1221 | |
| 1222 | // With unrestricted number of address tokens should behave normally. |
| 1223 | unpacked_model->classification_options->address_min_num_tokens = 0; |
| 1224 | |
| 1225 | flatbuffers::FlatBufferBuilder builder; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 1226 | FinishModelBuffer(builder, Model::Pack(builder, unpacked_model.get())); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1227 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1228 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1229 | builder.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1230 | ASSERT_TRUE(classifier); |
| 1231 | |
| 1232 | EXPECT_EQ(FirstResult(classifier->ClassifyText( |
| 1233 | "I live at 350 Third Street, Cambridge.", {10, 37})), |
| 1234 | "address"); |
| 1235 | |
| 1236 | // Raise number of address tokens to suppress the address classification. |
| 1237 | unpacked_model->classification_options->address_min_num_tokens = 5; |
| 1238 | |
| 1239 | flatbuffers::FlatBufferBuilder builder2; |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame^] | 1240 | FinishModelBuffer(builder2, Model::Pack(builder2, unpacked_model.get())); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1241 | classifier = Annotator::FromUnownedBuffer( |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1242 | reinterpret_cast<const char*>(builder2.GetBufferPointer()), |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1243 | builder2.GetSize(), &unilib_, &calendarlib_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 1244 | ASSERT_TRUE(classifier); |
| 1245 | |
| 1246 | EXPECT_EQ(FirstResult(classifier->ClassifyText( |
| 1247 | "I live at 350 Third Street, Cambridge.", {10, 37})), |
| 1248 | "other"); |
| 1249 | } |
| 1250 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 1251 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1252 | } // namespace |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 1253 | } // namespace libtextclassifier3 |