Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2017 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "text-classifier.h" |
| 18 | |
| 19 | #include <fstream> |
| 20 | #include <iostream> |
| 21 | #include <memory> |
| 22 | #include <string> |
| 23 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 24 | #include "model_generated.h" |
| 25 | #include "types-test-util.h" |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 26 | #include "gmock/gmock.h" |
| 27 | #include "gtest/gtest.h" |
| 28 | |
| 29 | namespace libtextclassifier2 { |
| 30 | namespace { |
| 31 | |
| 32 | using testing::ElementsAreArray; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 33 | using testing::IsEmpty; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 34 | using testing::Pair; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 35 | using testing::Values; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 36 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 37 | std::string FirstResult(const std::vector<ClassificationResult>& results) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 38 | if (results.empty()) { |
| 39 | return "<INVALID RESULTS>"; |
| 40 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 41 | return results[0].collection; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 42 | } |
| 43 | |
| 44 | MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") { |
| 45 | return testing::Value(arg.span, Pair(start, end)) && |
| 46 | testing::Value(FirstResult(arg.classification), best_class); |
| 47 | } |
| 48 | |
// Reads the entire file at `file_name` and returns its raw bytes.
//
// The stream is opened in binary mode: the tests in this file use the result
// as a serialized model buffer, so no newline translation may be applied to
// it. A file that cannot be opened yields an empty string.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream),
                     std::istreambuf_iterator<char>());
}
| 53 | |
| 54 | std::string GetModelPath() { |
| 55 | return LIBTEXTCLASSIFIER_TEST_DATA_DIR; |
| 56 | } |
| 57 | |
| 58 | TEST(TextClassifierTest, EmbeddingExecutorLoadingFails) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 59 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 60 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 61 | TextClassifier::FromPath(GetModelPath() + "wrong_embeddings.fb", &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 62 | EXPECT_FALSE(classifier); |
| 63 | } |
| 64 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 65 | class TextClassifierTest : public ::testing::TestWithParam<const char*> {}; |
| 66 | |
| 67 | INSTANTIATE_TEST_CASE_P(ClickContext, TextClassifierTest, |
| 68 | Values("test_model_cc.fb")); |
| 69 | INSTANTIATE_TEST_CASE_P(BoundsSensitive, TextClassifierTest, |
| 70 | Values("test_model.fb")); |
| 71 | |
| 72 | TEST_P(TextClassifierTest, ClassifyText) { |
| 73 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 74 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 75 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 76 | ASSERT_TRUE(classifier); |
| 77 | |
| 78 | EXPECT_EQ("other", |
| 79 | FirstResult(classifier->ClassifyText( |
| 80 | "this afternoon Barack Obama gave a speech at", {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 81 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 82 | "Call me at (800) 123-456 today", {11, 24}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 83 | |
| 84 | // More lines. |
| 85 | EXPECT_EQ("other", |
| 86 | FirstResult(classifier->ClassifyText( |
| 87 | "this afternoon Barack Obama gave a speech at|Visit " |
| 88 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 89 | {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 90 | EXPECT_EQ("phone", |
| 91 | FirstResult(classifier->ClassifyText( |
| 92 | "this afternoon Barack Obama gave a speech at|Visit " |
| 93 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 94 | {90, 103}))); |
| 95 | |
| 96 | // Single word. |
| 97 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("obama", {0, 5}))); |
| 98 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("asdf", {0, 4}))); |
| 99 | EXPECT_EQ("<INVALID RESULTS>", |
| 100 | FirstResult(classifier->ClassifyText("asdf", {0, 0}))); |
| 101 | |
| 102 | // Junk. |
| 103 | EXPECT_EQ("<INVALID RESULTS>", |
| 104 | FirstResult(classifier->ClassifyText("", {0, 0}))); |
| 105 | EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText( |
| 106 | "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5}))); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 107 | // Test invalid utf8 input. |
| 108 | EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText( |
| 109 | "\xf0\x9f\x98\x8b\x8b", {0, 0}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 110 | } |
| 111 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 112 | TEST_P(TextClassifierTest, ClassifyTextDisabledFail) { |
| 113 | CREATE_UNILIB_FOR_TESTING; |
| 114 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 115 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 116 | |
| 117 | unpacked_model->classification_model.clear(); |
| 118 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 119 | unpacked_model->triggering_options->enabled_modes = ModeFlag_SELECTION; |
| 120 | |
| 121 | flatbuffers::FlatBufferBuilder builder; |
| 122 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 123 | |
| 124 | std::unique_ptr<TextClassifier> classifier = |
| 125 | TextClassifier::FromUnownedBuffer( |
| 126 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 127 | builder.GetSize(), &unilib); |
| 128 | |
| 129 | // The classification model is still needed for selection scores. |
| 130 | ASSERT_FALSE(classifier); |
| 131 | } |
| 132 | |
| 133 | TEST_P(TextClassifierTest, ClassifyTextDisabled) { |
| 134 | CREATE_UNILIB_FOR_TESTING; |
| 135 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 136 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 137 | |
| 138 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 139 | unpacked_model->triggering_options->enabled_modes = |
| 140 | ModeFlag_ANNOTATION_AND_SELECTION; |
| 141 | |
| 142 | flatbuffers::FlatBufferBuilder builder; |
| 143 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 144 | |
| 145 | std::unique_ptr<TextClassifier> classifier = |
| 146 | TextClassifier::FromUnownedBuffer( |
| 147 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 148 | builder.GetSize(), &unilib); |
| 149 | ASSERT_TRUE(classifier); |
| 150 | |
| 151 | EXPECT_THAT( |
| 152 | classifier->ClassifyText("Call me at (800) 123-456 today", {11, 24}), |
| 153 | IsEmpty()); |
| 154 | } |
| 155 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 156 | TEST_P(TextClassifierTest, ClassifyTextFilteredCollections) { |
| 157 | CREATE_UNILIB_FOR_TESTING; |
| 158 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 159 | |
| 160 | std::unique_ptr<TextClassifier> classifier = |
| 161 | TextClassifier::FromUnownedBuffer(test_model.c_str(), test_model.size(), |
| 162 | &unilib); |
| 163 | ASSERT_TRUE(classifier); |
| 164 | |
| 165 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 166 | "Call me at (800) 123-456 today", {11, 24}))); |
| 167 | |
| 168 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 169 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 170 | |
| 171 | // Disable phone classification |
| 172 | unpacked_model->output_options->filtered_collections_classification.push_back( |
| 173 | "phone"); |
| 174 | |
| 175 | flatbuffers::FlatBufferBuilder builder; |
| 176 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 177 | |
| 178 | classifier = TextClassifier::FromUnownedBuffer( |
| 179 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 180 | builder.GetSize(), &unilib); |
| 181 | ASSERT_TRUE(classifier); |
| 182 | |
| 183 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText( |
| 184 | "Call me at (800) 123-456 today", {11, 24}))); |
| 185 | |
| 186 | // Check that the address classification still passes. |
| 187 | EXPECT_EQ("address", FirstResult(classifier->ClassifyText( |
| 188 | "350 Third Street, Cambridge", {0, 27}))); |
| 189 | } |
| 190 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 191 | std::unique_ptr<RegexModel_::PatternT> MakePattern( |
| 192 | const std::string& collection_name, const std::string& pattern, |
| 193 | const bool enabled_for_classification, const bool enabled_for_selection, |
| 194 | const bool enabled_for_annotation, const float score) { |
| 195 | std::unique_ptr<RegexModel_::PatternT> result(new RegexModel_::PatternT); |
| 196 | result->collection_name = collection_name; |
| 197 | result->pattern = pattern; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 198 | // We cannot directly operate with |= on the flag, so use an int here. |
| 199 | int enabled_modes = ModeFlag_NONE; |
| 200 | if (enabled_for_annotation) enabled_modes |= ModeFlag_ANNOTATION; |
| 201 | if (enabled_for_classification) enabled_modes |= ModeFlag_CLASSIFICATION; |
| 202 | if (enabled_for_selection) enabled_modes |= ModeFlag_SELECTION; |
| 203 | result->enabled_modes = static_cast<ModeFlag>(enabled_modes); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 204 | result->target_classification_score = score; |
| 205 | result->priority_score = score; |
| 206 | return result; |
| 207 | } |
| 208 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds classification-only regex patterns for "person" and "flight" to the
// model and checks the first classification result for several inputs.
TEST_P(TextClassifierTest, ClassifyTextRegularExpression) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& regex_patterns = model->regex_model->patterns;
  regex_patterns.push_back(MakePattern(
      "person", "Barack Obama", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0));
  regex_patterns.push_back(MakePattern(
      "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5));

  // Re-serialize the modified model.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto buffer_size = builder.GetSize();

  const auto classifier =
      TextClassifier::FromUnownedBuffer(buffer, buffer_size, &unilib);
  ASSERT_TRUE(classifier);

  const std::string flight_text = "Your flight LX373 is delayed by 3 hours.";
  EXPECT_EQ("flight",
            FirstResult(classifier->ClassifyText(flight_text, {12, 17})));

  const std::string speech_text =
      "this afternoon Barack Obama gave a speech at";
  EXPECT_EQ("person",
            FirstResult(classifier->ClassifyText(speech_text, {15, 27})));

  EXPECT_EQ("email",
            FirstResult(classifier->ClassifyText("you@android.com", {0, 15})));
  const std::string contact_text = "Contact me at you@android.com";
  EXPECT_EQ("email",
            FirstResult(classifier->ClassifyText(contact_text, {14, 29})));

  const std::string visit_text = "Visit www.google.com every today!";
  EXPECT_EQ("url", FirstResult(classifier->ClassifyText(visit_text, {6, 20})));

  EXPECT_EQ("flight", FirstResult(classifier->ClassifyText("LX 37", {0, 5})));
  const std::string embedded_flight_text = "flight LX 37 abcd";
  EXPECT_EQ("flight", FirstResult(classifier->ClassifyText(
                          embedded_flight_text, {7, 12})));

  // More lines.
  const std::string multi_line_text =
      "this afternoon Barack Obama gave a speech at|Visit "
      "www.google.com every today!|Call me at (800) 123-456 today.";
  EXPECT_EQ("url",
            FirstResult(classifier->ClassifyText(multi_line_text, {51, 65})));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 258 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds selection-only regex patterns to the model and checks that
// SuggestSelection returns the span captured by the matching pattern.
TEST_P(TextClassifierTest, SuggestSelectionRegularExpression) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& regex_patterns = model->regex_model->patterns;
  regex_patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  // The flight pattern gets its priority score overridden to 1.1.
  auto flight_pattern = MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0);
  flight_pattern->priority_score = 1.1;
  regex_patterns.push_back(std::move(flight_pattern));

  // Re-serialize the modified model.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto buffer_size = builder.GetSize();

  const auto classifier =
      TextClassifier::FromUnownedBuffer(buffer, buffer_size, &unilib);
  ASSERT_TRUE(classifier);

  // Check regular expression selection.
  const std::string flight_text = "Your flight MA 0123 is delayed by 3 hours.";
  EXPECT_EQ(classifier->SuggestSelection(flight_text, {12, 14}),
            std::make_pair(12, 19));
  const std::string speech_text =
      "this afternoon Barack Obama gave a speech at";
  EXPECT_EQ(classifier->SuggestSelection(speech_text, {15, 21}),
            std::make_pair(15, 27));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 292 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// With the flight pattern's priority score lowered to 0.5, the conflicting
// selection is expected to resolve to the wider span (26, 62) instead of the
// regex match.
TEST_P(TextClassifierTest,
       SuggestSelectionRegularExpressionConflictsModelWins) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& regex_patterns = model->regex_model->patterns;
  regex_patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  auto flight_pattern = MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0);
  flight_pattern->priority_score = 0.5;
  regex_patterns.push_back(std::move(flight_pattern));

  // Re-serialize the modified model.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto buffer_size = builder.GetSize();

  // Note: this test does not pass a UniLib instance to the factory.
  const auto classifier =
      TextClassifier::FromUnownedBuffer(buffer, buffer_size);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  const std::string conflict_text =
      "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123";
  EXPECT_EQ(classifier->SuggestSelection(conflict_text, {55, 57}),
            std::make_pair(26, 62));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 325 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// With the flight pattern's priority score raised to 1.1, the conflicting
// selection is expected to resolve to the regex match (55, 62).
TEST_P(TextClassifierTest,
       SuggestSelectionRegularExpressionConflictsRegexWins) {
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& regex_patterns = model->regex_model->patterns;
  regex_patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0));
  auto flight_pattern = MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0);
  flight_pattern->priority_score = 1.1;
  regex_patterns.push_back(std::move(flight_pattern));

  // Re-serialize the modified model.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto buffer_size = builder.GetSize();

  // Note: this test does not pass a UniLib instance to the factory.
  const auto classifier =
      TextClassifier::FromUnownedBuffer(buffer, buffer_size);
  ASSERT_TRUE(classifier);

  // Check conflict resolution.
  const std::string conflict_text =
      "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123";
  EXPECT_EQ(classifier->SuggestSelection(conflict_text, {55, 57}),
            std::make_pair(55, 62));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 358 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Adds annotation-only regex patterns to the model and checks the full list
// of annotated spans produced for a two-line text.
TEST_P(TextClassifierTest, AnnotateRegex) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string serialized_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> model = UnPackModel(serialized_model.c_str());

  // Add test regex models.
  auto& regex_patterns = model->regex_model->patterns;
  regex_patterns.push_back(MakePattern(
      "person", " (Barack Obama) ", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0));
  regex_patterns.push_back(MakePattern(
      "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5));

  // Re-serialize the modified model.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, model.get()));
  const char* const buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto buffer_size = builder.GetSize();

  const auto classifier =
      TextClassifier::FromUnownedBuffer(buffer, buffer_size, &unilib);
  ASSERT_TRUE(classifier);

  const std::string annotated_text =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";
  // The expected spans are listed in the order they occur in the text.
  EXPECT_THAT(classifier->Annotate(annotated_text),
              ElementsAreArray({
                  IsAnnotatedSpan(6, 18, "person"),
                  IsAnnotatedSpan(19, 24, "date"),
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 393 | |
| 394 | TEST_P(TextClassifierTest, PhoneFiltering) { |
| 395 | CREATE_UNILIB_FOR_TESTING; |
| 396 | std::unique_ptr<TextClassifier> classifier = |
| 397 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 398 | ASSERT_TRUE(classifier); |
| 399 | |
| 400 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 401 | "phone: (123) 456 789", {7, 20}))); |
| 402 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 403 | "phone: (123) 456 789,0001112", {7, 25}))); |
| 404 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText( |
| 405 | "phone: (123) 456 789,0001112", {7, 28}))); |
| 406 | } |
| 407 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 408 | TEST_P(TextClassifierTest, SuggestSelection) { |
| 409 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 410 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 411 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 412 | ASSERT_TRUE(classifier); |
| 413 | |
| 414 | EXPECT_EQ(classifier->SuggestSelection( |
| 415 | "this afternoon Barack Obama gave a speech at", {15, 21}), |
| 416 | std::make_pair(15, 21)); |
| 417 | |
| 418 | // Try passing whole string. |
| 419 | // If more than 1 token is specified, we should return back what entered. |
| 420 | EXPECT_EQ( |
| 421 | classifier->SuggestSelection("350 Third Street, Cambridge", {0, 27}), |
| 422 | std::make_pair(0, 27)); |
| 423 | |
| 424 | // Single letter. |
| 425 | EXPECT_EQ(classifier->SuggestSelection("a", {0, 1}), std::make_pair(0, 1)); |
| 426 | |
| 427 | // Single word. |
| 428 | EXPECT_EQ(classifier->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4)); |
| 429 | |
| 430 | EXPECT_EQ( |
| 431 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 432 | std::make_pair(11, 23)); |
| 433 | |
| 434 | // Unpaired bracket stripping. |
| 435 | EXPECT_EQ( |
| 436 | classifier->SuggestSelection("call me at (857) 225 3556 today", {11, 16}), |
| 437 | std::make_pair(11, 25)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 438 | EXPECT_EQ(classifier->SuggestSelection("call me at (857 today", {11, 15}), |
| 439 | std::make_pair(12, 15)); |
| 440 | EXPECT_EQ(classifier->SuggestSelection("call me at 3556) today", {11, 16}), |
| 441 | std::make_pair(11, 15)); |
| 442 | EXPECT_EQ(classifier->SuggestSelection("call me at )857( today", {11, 16}), |
| 443 | std::make_pair(12, 15)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 444 | |
| 445 | // If the resulting selection would be empty, the original span is returned. |
| 446 | EXPECT_EQ(classifier->SuggestSelection("call me at )( today", {11, 13}), |
| 447 | std::make_pair(11, 13)); |
| 448 | EXPECT_EQ(classifier->SuggestSelection("call me at ( today", {11, 12}), |
| 449 | std::make_pair(11, 12)); |
| 450 | EXPECT_EQ(classifier->SuggestSelection("call me at ) today", {11, 12}), |
| 451 | std::make_pair(11, 12)); |
| 452 | } |
| 453 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 454 | TEST_P(TextClassifierTest, SuggestSelectionDisabledFail) { |
| 455 | CREATE_UNILIB_FOR_TESTING; |
| 456 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 457 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 458 | |
| 459 | // Disable the selection model. |
| 460 | unpacked_model->selection_model.clear(); |
| 461 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 462 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ANNOTATION; |
| 463 | |
| 464 | flatbuffers::FlatBufferBuilder builder; |
| 465 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 466 | |
| 467 | std::unique_ptr<TextClassifier> classifier = |
| 468 | TextClassifier::FromUnownedBuffer( |
| 469 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 470 | builder.GetSize(), &unilib); |
| 471 | // Selection model needs to be present for annotation. |
| 472 | ASSERT_FALSE(classifier); |
| 473 | } |
| 474 | |
| 475 | TEST_P(TextClassifierTest, SuggestSelectionDisabled) { |
| 476 | CREATE_UNILIB_FOR_TESTING; |
| 477 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 478 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 479 | |
| 480 | // Disable the selection model. |
| 481 | unpacked_model->selection_model.clear(); |
| 482 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 483 | unpacked_model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION; |
| 484 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION; |
| 485 | |
| 486 | flatbuffers::FlatBufferBuilder builder; |
| 487 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 488 | |
| 489 | std::unique_ptr<TextClassifier> classifier = |
| 490 | TextClassifier::FromUnownedBuffer( |
| 491 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 492 | builder.GetSize(), &unilib); |
| 493 | ASSERT_TRUE(classifier); |
| 494 | |
| 495 | EXPECT_EQ( |
| 496 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 497 | std::make_pair(11, 14)); |
| 498 | |
| 499 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 500 | "call me at (800) 123-456 today", {11, 24}))); |
| 501 | |
| 502 | EXPECT_THAT(classifier->Annotate("call me at (800) 123-456 today"), |
| 503 | IsEmpty()); |
| 504 | } |
| 505 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 506 | TEST_P(TextClassifierTest, SuggestSelectionFilteredCollections) { |
| 507 | CREATE_UNILIB_FOR_TESTING; |
| 508 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 509 | |
| 510 | std::unique_ptr<TextClassifier> classifier = |
| 511 | TextClassifier::FromUnownedBuffer(test_model.c_str(), test_model.size(), |
| 512 | &unilib); |
| 513 | ASSERT_TRUE(classifier); |
| 514 | |
| 515 | EXPECT_EQ( |
| 516 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 517 | std::make_pair(11, 23)); |
| 518 | |
| 519 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 520 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 521 | |
| 522 | // Disable phone selection |
| 523 | unpacked_model->output_options->filtered_collections_selection.push_back( |
| 524 | "phone"); |
| 525 | // We need to force this for filtering. |
| 526 | unpacked_model->selection_options->always_classify_suggested_selection = true; |
| 527 | |
| 528 | flatbuffers::FlatBufferBuilder builder; |
| 529 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 530 | |
| 531 | classifier = TextClassifier::FromUnownedBuffer( |
| 532 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 533 | builder.GetSize(), &unilib); |
| 534 | ASSERT_TRUE(classifier); |
| 535 | |
| 536 | EXPECT_EQ( |
| 537 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 538 | std::make_pair(11, 14)); |
| 539 | |
| 540 | // Address selection should still work. |
| 541 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}), |
| 542 | std::make_pair(0, 27)); |
| 543 | } |
| 544 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 545 | TEST_P(TextClassifierTest, SuggestSelectionsAreSymmetric) { |
| 546 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 547 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 548 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 549 | ASSERT_TRUE(classifier); |
| 550 | |
| 551 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {0, 3}), |
| 552 | std::make_pair(0, 27)); |
| 553 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}), |
| 554 | std::make_pair(0, 27)); |
| 555 | EXPECT_EQ( |
| 556 | classifier->SuggestSelection("350 Third Street, Cambridge", {10, 16}), |
| 557 | std::make_pair(0, 27)); |
| 558 | EXPECT_EQ(classifier->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge", |
| 559 | {16, 22}), |
| 560 | std::make_pair(6, 33)); |
| 561 | } |
| 562 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 563 | TEST_P(TextClassifierTest, SuggestSelectionWithNewLine) { |
| 564 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 565 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 566 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 567 | ASSERT_TRUE(classifier); |
| 568 | |
| 569 | EXPECT_EQ(classifier->SuggestSelection("abc\n857 225 3556", {4, 7}), |
| 570 | std::make_pair(4, 16)); |
| 571 | EXPECT_EQ(classifier->SuggestSelection("857 225 3556\nabc", {0, 3}), |
| 572 | std::make_pair(0, 12)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 573 | |
| 574 | SelectionOptions options; |
| 575 | EXPECT_EQ(classifier->SuggestSelection("857 225\n3556\nabc", {0, 3}, options), |
| 576 | std::make_pair(0, 7)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 577 | } |
| 578 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 579 | TEST_P(TextClassifierTest, SuggestSelectionWithPunctuation) { |
| 580 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 581 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 582 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 583 | ASSERT_TRUE(classifier); |
| 584 | |
| 585 | // From the right. |
| 586 | EXPECT_EQ(classifier->SuggestSelection( |
| 587 | "this afternoon BarackObama, gave a speech at", {15, 26}), |
| 588 | std::make_pair(15, 26)); |
| 589 | |
| 590 | // From the right multiple. |
| 591 | EXPECT_EQ(classifier->SuggestSelection( |
| 592 | "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}), |
| 593 | std::make_pair(15, 26)); |
| 594 | |
| 595 | // From the left multiple. |
| 596 | EXPECT_EQ(classifier->SuggestSelection( |
| 597 | "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}), |
| 598 | std::make_pair(21, 32)); |
| 599 | |
| 600 | // From both sides. |
| 601 | EXPECT_EQ(classifier->SuggestSelection( |
| 602 | "this afternoon !BarackObama,- gave a speech at", {16, 27}), |
| 603 | std::make_pair(16, 27)); |
| 604 | } |
| 605 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 606 | TEST_P(TextClassifierTest, SuggestSelectionNoCrashWithJunk) { |
| 607 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 608 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 609 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 610 | ASSERT_TRUE(classifier); |
| 611 | |
| 612 | // Try passing in bunch of invalid selections. |
| 613 | EXPECT_EQ(classifier->SuggestSelection("", {0, 27}), std::make_pair(0, 27)); |
| 614 | EXPECT_EQ(classifier->SuggestSelection("", {-10, 27}), |
| 615 | std::make_pair(-10, 27)); |
| 616 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {0, 27}), |
| 617 | std::make_pair(0, 27)); |
| 618 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-30, 300}), |
| 619 | std::make_pair(-30, 300)); |
| 620 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-10, -1}), |
| 621 | std::make_pair(-10, -1)); |
| 622 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {100, 17}), |
| 623 | std::make_pair(100, 17)); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 624 | |
| 625 | // Try passing invalid utf8. |
| 626 | EXPECT_EQ(classifier->SuggestSelection("\xf0\x9f\x98\x8b\x8b", {-1, -1}), |
| 627 | std::make_pair(-1, -1)); |
| 628 | } |
| 629 | |
| 630 | TEST_P(TextClassifierTest, SuggestSelectionSelectSpace) { |
| 631 | CREATE_UNILIB_FOR_TESTING; |
| 632 | std::unique_ptr<TextClassifier> classifier = |
| 633 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
| 634 | ASSERT_TRUE(classifier); |
| 635 | |
| 636 | EXPECT_EQ( |
| 637 | classifier->SuggestSelection("call me at 857 225 3556 today", {14, 15}), |
| 638 | std::make_pair(11, 23)); |
| 639 | EXPECT_EQ( |
| 640 | classifier->SuggestSelection("call me at 857 225 3556 today", {10, 11}), |
| 641 | std::make_pair(10, 11)); |
| 642 | EXPECT_EQ( |
| 643 | classifier->SuggestSelection("call me at 857 225 3556 today", {23, 24}), |
| 644 | std::make_pair(23, 24)); |
| 645 | EXPECT_EQ( |
| 646 | classifier->SuggestSelection("call me at 857 225 3556, today", {23, 24}), |
| 647 | std::make_pair(23, 24)); |
| 648 | EXPECT_EQ(classifier->SuggestSelection("call me at 857 225 3556, today", |
| 649 | {14, 17}), |
| 650 | std::make_pair(11, 25)); |
| 651 | EXPECT_EQ( |
| 652 | classifier->SuggestSelection("call me at 857-225 3556, today", {14, 17}), |
| 653 | std::make_pair(11, 23)); |
| 654 | EXPECT_EQ( |
| 655 | classifier->SuggestSelection( |
| 656 | "let's meet at 350 Third Street Cambridge and go there", {30, 31}), |
| 657 | std::make_pair(14, 40)); |
| 658 | EXPECT_EQ(classifier->SuggestSelection("call me today", {4, 5}), |
| 659 | std::make_pair(4, 5)); |
| 660 | EXPECT_EQ(classifier->SuggestSelection("call me today", {7, 8}), |
| 661 | std::make_pair(7, 8)); |
| 662 | |
| 663 | // With a punctuation around the selected whitespace. |
| 664 | EXPECT_EQ( |
| 665 | classifier->SuggestSelection( |
| 666 | "let's meet at 350 Third Street, Cambridge and go there", {31, 32}), |
| 667 | std::make_pair(14, 41)); |
| 668 | |
| 669 | // When all's whitespace, should return the original indices. |
| 670 | EXPECT_EQ(classifier->SuggestSelection(" ", {0, 1}), |
| 671 | std::make_pair(0, 1)); |
| 672 | EXPECT_EQ(classifier->SuggestSelection(" ", {0, 3}), |
| 673 | std::make_pair(0, 3)); |
| 674 | EXPECT_EQ(classifier->SuggestSelection(" ", {2, 3}), |
| 675 | std::make_pair(2, 3)); |
| 676 | EXPECT_EQ(classifier->SuggestSelection(" ", {5, 6}), |
| 677 | std::make_pair(5, 6)); |
| 678 | } |
| 679 | |
| 680 | TEST(TextClassifierTest, SnapLeftIfWhitespaceSelection) { |
| 681 | CREATE_UNILIB_FOR_TESTING; |
| 682 | UnicodeText text; |
| 683 | |
| 684 | text = UTF8ToUnicodeText("abcd efgh", /*do_copy=*/false); |
| 685 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib), |
| 686 | std::make_pair(3, 4)); |
| 687 | text = UTF8ToUnicodeText("abcd ", /*do_copy=*/false); |
| 688 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib), |
| 689 | std::make_pair(3, 4)); |
| 690 | |
| 691 | // Nothing on the left. |
| 692 | text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false); |
| 693 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib), |
| 694 | std::make_pair(4, 5)); |
| 695 | text = UTF8ToUnicodeText(" efgh", /*do_copy=*/false); |
| 696 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib), |
| 697 | std::make_pair(0, 1)); |
| 698 | |
| 699 | // Whitespace only. |
| 700 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
| 701 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({2, 3}, text, unilib), |
| 702 | std::make_pair(2, 3)); |
| 703 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
| 704 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({4, 5}, text, unilib), |
| 705 | std::make_pair(4, 5)); |
| 706 | text = UTF8ToUnicodeText(" ", /*do_copy=*/false); |
| 707 | EXPECT_EQ(internal::SnapLeftIfWhitespaceSelection({0, 1}, text, unilib), |
| 708 | std::make_pair(0, 1)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 709 | } |
| 710 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 711 | TEST_P(TextClassifierTest, Annotate) { |
| 712 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 713 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 714 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 715 | ASSERT_TRUE(classifier); |
| 716 | |
| 717 | const std::string test_string = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 718 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 719 | "number is 853 225 3556"; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 720 | EXPECT_THAT(classifier->Annotate(test_string), |
| 721 | ElementsAreArray({ |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 722 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 723 | IsAnnotatedSpan(19, 24, "date"), |
| 724 | #endif |
| 725 | IsAnnotatedSpan(28, 55, "address"), |
| 726 | IsAnnotatedSpan(79, 91, "phone"), |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 727 | })); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 728 | |
| 729 | AnnotationOptions options; |
| 730 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 731 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 732 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 733 | |
| 734 | // Try passing invalid utf8. |
| 735 | EXPECT_TRUE( |
| 736 | classifier->Annotate("853 225 3556\n\xf0\x9f\x98\x8b\x8b", options) |
| 737 | .empty()); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 738 | } |
| 739 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 740 | TEST_P(TextClassifierTest, AnnotateSmallBatches) { |
| 741 | CREATE_UNILIB_FOR_TESTING; |
| 742 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 743 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 744 | |
| 745 | // Set the batch size. |
| 746 | unpacked_model->selection_options->batch_size = 4; |
| 747 | flatbuffers::FlatBufferBuilder builder; |
| 748 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 749 | |
| 750 | std::unique_ptr<TextClassifier> classifier = |
| 751 | TextClassifier::FromUnownedBuffer( |
| 752 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 753 | builder.GetSize(), &unilib); |
| 754 | ASSERT_TRUE(classifier); |
| 755 | |
| 756 | const std::string test_string = |
| 757 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 758 | "number is 853 225 3556"; |
| 759 | EXPECT_THAT(classifier->Annotate(test_string), |
| 760 | ElementsAreArray({ |
| 761 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 762 | IsAnnotatedSpan(19, 24, "date"), |
| 763 | #endif |
| 764 | IsAnnotatedSpan(28, 55, "address"), |
| 765 | IsAnnotatedSpan(79, 91, "phone"), |
| 766 | })); |
| 767 | |
| 768 | AnnotationOptions options; |
| 769 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 770 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 771 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
| 772 | } |
| 773 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks Annotate() on a model whose min_annotate_confidence is raised to 2.
TEST_P(TextClassifierTest, AnnotateFilteringDiscardAll) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Add test threshold.
  unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT);
  unpacked_model->triggering_options->min_annotate_confidence =
      2.f;  // Discards all results.
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, unpacked_model.get()));

  // The classifier reads from the builder's buffer without owning it.
  const char* const model_buffer =
      reinterpret_cast<const char*>(builder.GetBufferPointer());
  const auto classifier = TextClassifier::FromUnownedBuffer(
      model_buffer, builder.GetSize(), &unilib);
  ASSERT_TRUE(classifier);

  const std::string test_string =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // NOTE(review): exactly one annotation is still expected despite the
  // threshold; presumably the ICU-only "date" result is not subject to
  // min_annotate_confidence -- confirm.
  const auto annotations = classifier->Annotate(test_string);
  EXPECT_EQ(annotations.size(), 1);
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 800 | |
| 801 | TEST_P(TextClassifierTest, AnnotateFilteringKeepAll) { |
| 802 | CREATE_UNILIB_FOR_TESTING; |
| 803 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 804 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 805 | |
| 806 | // Add test thresholds. |
| 807 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 808 | unpacked_model->triggering_options->min_annotate_confidence = |
| 809 | 0.f; // Keeps all results. |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 810 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ALL; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 811 | flatbuffers::FlatBufferBuilder builder; |
| 812 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 813 | |
| 814 | std::unique_ptr<TextClassifier> classifier = |
| 815 | TextClassifier::FromUnownedBuffer( |
| 816 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 817 | builder.GetSize(), &unilib); |
| 818 | ASSERT_TRUE(classifier); |
| 819 | |
| 820 | const std::string test_string = |
| 821 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 822 | "number is 853 225 3556"; |
| 823 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 824 | EXPECT_EQ(classifier->Annotate(test_string).size(), 3); |
| 825 | #else |
| 826 | // In non-ICU mode there is no "date" result. |
| 827 | EXPECT_EQ(classifier->Annotate(test_string).size(), 2); |
| 828 | #endif |
| 829 | } |
| 830 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 831 | TEST_P(TextClassifierTest, AnnotateDisabled) { |
| 832 | CREATE_UNILIB_FOR_TESTING; |
| 833 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 834 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 835 | |
| 836 | // Disable the model for annotation. |
| 837 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION; |
| 838 | flatbuffers::FlatBufferBuilder builder; |
| 839 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 840 | |
| 841 | std::unique_ptr<TextClassifier> classifier = |
| 842 | TextClassifier::FromUnownedBuffer( |
| 843 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 844 | builder.GetSize(), &unilib); |
| 845 | ASSERT_TRUE(classifier); |
| 846 | const std::string test_string = |
| 847 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 848 | "number is 853 225 3556"; |
| 849 | EXPECT_THAT(classifier->Annotate(test_string), IsEmpty()); |
| 850 | } |
| 851 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 852 | TEST_P(TextClassifierTest, AnnotateFilteredCollections) { |
| 853 | CREATE_UNILIB_FOR_TESTING; |
| 854 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 855 | |
| 856 | std::unique_ptr<TextClassifier> classifier = |
| 857 | TextClassifier::FromUnownedBuffer(test_model.c_str(), test_model.size(), |
| 858 | &unilib); |
| 859 | ASSERT_TRUE(classifier); |
| 860 | |
| 861 | const std::string test_string = |
| 862 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 863 | "number is 853 225 3556"; |
| 864 | |
| 865 | EXPECT_THAT(classifier->Annotate(test_string), |
| 866 | ElementsAreArray({ |
| 867 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 868 | IsAnnotatedSpan(19, 24, "date"), |
| 869 | #endif |
| 870 | IsAnnotatedSpan(28, 55, "address"), |
| 871 | IsAnnotatedSpan(79, 91, "phone"), |
| 872 | })); |
| 873 | |
| 874 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 875 | unpacked_model->output_options.reset(new OutputOptionsT); |
| 876 | |
| 877 | // Disable phone annotation |
| 878 | unpacked_model->output_options->filtered_collections_annotation.push_back( |
| 879 | "phone"); |
| 880 | |
| 881 | flatbuffers::FlatBufferBuilder builder; |
| 882 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 883 | |
| 884 | classifier = TextClassifier::FromUnownedBuffer( |
| 885 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 886 | builder.GetSize(), &unilib); |
| 887 | ASSERT_TRUE(classifier); |
| 888 | |
| 889 | EXPECT_THAT(classifier->Annotate(test_string), |
| 890 | ElementsAreArray({ |
| 891 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 892 | IsAnnotatedSpan(19, 24, "date"), |
| 893 | #endif |
| 894 | IsAnnotatedSpan(28, 55, "address"), |
| 895 | })); |
| 896 | } |
| 897 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks that a filtered collection can be used to suppress another
// annotation: a regex pattern for collection "suppress" is added to the model
// and "suppress" is listed in filtered_collections_annotation; afterwards the
// phone number is expected to be absent from the annotations.
TEST_P(TextClassifierTest, AnnotateFilteredCollectionsSuppress) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string test_model = ReadFile(GetModelPath() + GetParam());

  std::unique_ptr<TextClassifier> classifier =
      TextClassifier::FromUnownedBuffer(test_model.c_str(), test_model.size(),
                                        &unilib);
  ASSERT_TRUE(classifier);

  const std::string test_string =
      "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone "
      "number is 853 225 3556";

  // Baseline with the unmodified model. The whole test is compiled only with
  // the ICU UniLib, so the "date" span is always expected here.
  EXPECT_THAT(classifier->Annotate(test_string),
              ElementsAreArray({
                  IsAnnotatedSpan(19, 24, "date"),
                  IsAnnotatedSpan(28, 55, "address"),
                  IsAnnotatedSpan(79, 91, "phone"),
              }));

  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());
  unpacked_model->output_options.reset(new OutputOptionsT);

  // We add a custom annotator that wins against the phone classification
  // below and that we subsequently suppress.
  unpacked_model->output_options->filtered_collections_annotation.push_back(
      "suppress");

  unpacked_model->regex_model->patterns.push_back(MakePattern(
      "suppress", "(\\d{3} ?\\d{4})",
      /*enabled_for_classification=*/false,
      /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 2.0));

  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, unpacked_model.get()));

  classifier = TextClassifier::FromUnownedBuffer(
      reinterpret_cast<const char*>(builder.GetBufferPointer()),
      builder.GetSize(), &unilib);
  ASSERT_TRUE(classifier);

  EXPECT_THAT(classifier->Annotate(test_string),
              ElementsAreArray({
                  IsAnnotatedSpan(19, 24, "date"),
                  IsAnnotatedSpan(28, 55, "address"),
              }));
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 949 | |
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Checks that ClassifyText() recognizes dates and resolves them to a UTC
// timestamp and granularity using options.reference_timezone.
TEST_P(TextClassifierTest, ClassifyTextDate) {
  std::unique_ptr<TextClassifier> classifier =
      TextClassifier::FromPath(GetModelPath() + GetParam());
  // Fatal on failure: every statement below dereferences the classifier.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  // 1483225200000 ms is 2017-01-01 00:00:00 at UTC+1.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText("january 1, 2017", {0, 15}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
  result.clear();

  // 1488355200000 ms is 2017-03-01 00:00:00 at UTC-8.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("march 1, 2017", {0, 13}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
  result.clear();

  // 1514831420000 ms is 2018-01-01 10:30:20 at UTC-8.
  options.reference_timezone = "America/Los_Angeles";
  result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_SECOND);
  result.clear();

  // Date on another line.
  options.reference_timezone = "Europe/Zurich";
  result = classifier->ClassifyText(
      "hello world this is the first line\n"
      "january 1, 2017",
      {35, 50}, options);
  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1000 | |
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Checks that options.locales decides how the ambiguous date "03/05" is read:
// the expected timestamps correspond to 5 March 1970 for "en-US" and to
// 3 May 1970 for "en-GB,en-US", both at midnight UTC+1.
TEST_P(TextClassifierTest, ClassifyTextDatePriorities) {
  std::unique_ptr<TextClassifier> classifier =
      TextClassifier::FromPath(GetModelPath() + GetParam());
  // Fatal on failure: every statement below dereferences the classifier.
  ASSERT_TRUE(classifier);

  std::vector<ClassificationResult> result;
  ClassificationOptions options;

  result.clear();
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-US";
  result = classifier->ClassifyText("03/05", {0, 5}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 5439600000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);

  result.clear();
  options.reference_timezone = "Europe/Zurich";
  options.locales = "en-GB,en-US";
  result = classifier->ClassifyText("03/05", {0, 5}, options);

  ASSERT_EQ(result.size(), 1);
  EXPECT_THAT(result[0].collection, "date");
  EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 10537200000);
  EXPECT_EQ(result[0].datetime_parse_result.granularity,
            DatetimeGranularity::GRANULARITY_DAY);
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
| 1033 | |
#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
// Checks that datetime patterns restricted to annotation and classification
// still produce a "date" from ClassifyText() and Annotate(), while
// SuggestSelection() returns the input selection unchanged.
TEST_P(TextClassifierTest, SuggestTextDateDisabled) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  // Disable the patterns for selection.
  for (auto& pattern : unpacked_model->datetime_model->patterns) {
    pattern->enabled_modes = ModeFlag_ANNOTATION_AND_CLASSIFICATION;
  }
  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(Model::Pack(builder, unpacked_model.get()));

  std::unique_ptr<TextClassifier> classifier =
      TextClassifier::FromUnownedBuffer(
          reinterpret_cast<const char*>(builder.GetBufferPointer()),
          builder.GetSize(), &unilib);
  ASSERT_TRUE(classifier);
  EXPECT_EQ("date",
            FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15})));
  EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}),
            std::make_pair(0, 7));
  EXPECT_THAT(classifier->Annotate("january 1, 2017"),
              ElementsAreArray({IsAnnotatedSpan(0, 15, "date")}));
}
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
| 1061 | |
| 1062 | class TestingTextClassifier : public TextClassifier { |
| 1063 | public: |
| 1064 | TestingTextClassifier(const std::string& model, const UniLib* unilib) |
| 1065 | : TextClassifier(ViewModel(model.data(), model.size()), unilib) {} |
| 1066 | |
| 1067 | using TextClassifier::ResolveConflicts; |
| 1068 | }; |
| 1069 | |
| 1070 | AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span, |
| 1071 | const std::string& collection, |
| 1072 | const float score) { |
| 1073 | AnnotatedSpan result; |
| 1074 | result.span = span; |
| 1075 | result.classification.push_back({collection, score}); |
| 1076 | return result; |
| 1077 | } |
| 1078 | |
| 1079 | TEST(TextClassifierTest, ResolveConflictsTrivial) { |
| 1080 | CREATE_UNILIB_FOR_TESTING; |
| 1081 | TestingTextClassifier classifier("", &unilib); |
| 1082 | |
| 1083 | std::vector<AnnotatedSpan> candidates{ |
| 1084 | {MakeAnnotatedSpan({0, 1}, "phone", 1.0)}}; |
| 1085 | |
| 1086 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 1087 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1088 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1089 | EXPECT_THAT(chosen, ElementsAreArray({0})); |
| 1090 | } |
| 1091 | |
| 1092 | TEST(TextClassifierTest, ResolveConflictsSequence) { |
| 1093 | CREATE_UNILIB_FOR_TESTING; |
| 1094 | TestingTextClassifier classifier("", &unilib); |
| 1095 | |
| 1096 | std::vector<AnnotatedSpan> candidates{{ |
| 1097 | MakeAnnotatedSpan({0, 1}, "phone", 1.0), |
| 1098 | MakeAnnotatedSpan({1, 2}, "phone", 1.0), |
| 1099 | MakeAnnotatedSpan({2, 3}, "phone", 1.0), |
| 1100 | MakeAnnotatedSpan({3, 4}, "phone", 1.0), |
| 1101 | MakeAnnotatedSpan({4, 5}, "phone", 1.0), |
| 1102 | }}; |
| 1103 | |
| 1104 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 1105 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1106 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1107 | EXPECT_THAT(chosen, ElementsAreArray({0, 1, 2, 3, 4})); |
| 1108 | } |
| 1109 | |
| 1110 | TEST(TextClassifierTest, ResolveConflictsThreeSpans) { |
| 1111 | CREATE_UNILIB_FOR_TESTING; |
| 1112 | TestingTextClassifier classifier("", &unilib); |
| 1113 | |
| 1114 | std::vector<AnnotatedSpan> candidates{{ |
| 1115 | MakeAnnotatedSpan({0, 3}, "phone", 1.0), |
| 1116 | MakeAnnotatedSpan({1, 5}, "phone", 0.5), // Looser! |
| 1117 | MakeAnnotatedSpan({3, 7}, "phone", 1.0), |
| 1118 | }}; |
| 1119 | |
| 1120 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 1121 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1122 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1123 | EXPECT_THAT(chosen, ElementsAreArray({0, 2})); |
| 1124 | } |
| 1125 | |
| 1126 | TEST(TextClassifierTest, ResolveConflictsThreeSpansReversed) { |
| 1127 | CREATE_UNILIB_FOR_TESTING; |
| 1128 | TestingTextClassifier classifier("", &unilib); |
| 1129 | |
| 1130 | std::vector<AnnotatedSpan> candidates{{ |
| 1131 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), // Looser! |
| 1132 | MakeAnnotatedSpan({1, 5}, "phone", 1.0), |
| 1133 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), // Looser! |
| 1134 | }}; |
| 1135 | |
| 1136 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 1137 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1138 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1139 | EXPECT_THAT(chosen, ElementsAreArray({1})); |
| 1140 | } |
| 1141 | |
| 1142 | TEST(TextClassifierTest, ResolveConflictsFiveSpans) { |
| 1143 | CREATE_UNILIB_FOR_TESTING; |
| 1144 | TestingTextClassifier classifier("", &unilib); |
| 1145 | |
| 1146 | std::vector<AnnotatedSpan> candidates{{ |
| 1147 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), |
| 1148 | MakeAnnotatedSpan({1, 5}, "other", 1.0), // Looser! |
| 1149 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), |
| 1150 | MakeAnnotatedSpan({8, 12}, "phone", 0.6), // Looser! |
| 1151 | MakeAnnotatedSpan({11, 15}, "phone", 0.9), |
| 1152 | }}; |
| 1153 | |
| 1154 | std::vector<int> chosen; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame^] | 1155 | classifier.ResolveConflicts(candidates, /*context=*/"", /*cached_tokens=*/{}, |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame] | 1156 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1157 | EXPECT_THAT(chosen, ElementsAreArray({0, 2, 4})); |
| 1158 | } |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1159 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Embeds one entity of each listed collection between two runs of 50000
// spaces and checks that Annotate(), SuggestSelection() and ClassifyText()
// all find it at the expected position.
TEST_P(TextClassifierTest, LongInput) {
  CREATE_UNILIB_FOR_TESTING;
  const auto classifier =
      TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib);
  ASSERT_TRUE(classifier);

  struct Example {
    std::string collection;
    std::string value;
  };
  const std::vector<Example> examples = {
      {"address", "350 Third Street, Cambridge"},
      {"phone", "123 456-7890"},
      {"url", "www.google.com"},
      {"email", "someone@gmail.com"},
      {"flight", "LX 38"},
      {"date", "September 1, 2018"},
  };

  for (const Example& example : examples) {
    const std::string input_100k =
        std::string(50000, ' ') + example.value + std::string(50000, ' ');
    const int value_length = example.value.size();

    const auto annotations = classifier->Annotate(input_100k);
    EXPECT_THAT(annotations,
                ElementsAreArray({IsAnnotatedSpan(
                    50000, 50000 + value_length, example.collection)}));

    // Selecting only the first character of the value should expand to the
    // whole value.
    const auto suggested =
        classifier->SuggestSelection(input_100k, {50000, 50001});
    EXPECT_EQ(suggested, std::make_pair(50000, 50000 + value_length));

    const auto classifications =
        classifier->ClassifyText(input_100k, {50000, 50000 + value_length});
    EXPECT_EQ(example.collection, FirstResult(classifications));
  }
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 1191 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// These coarse tests are there only to make sure the execution happens in
// reasonable amount of time. The results of the calls are deliberately not
// inspected.
TEST_P(TextClassifierTest, LongInputNoResultCheck) {
  CREATE_UNILIB_FOR_TESTING;
  const auto classifier =
      TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib);
  ASSERT_TRUE(classifier);

  const std::vector<std::string> values = {
      "http://www.aaaaaaaaaaaaaaaaaaaa.com ",
  };
  for (const std::string& value : values) {
    // The value is placed between two runs of 50000 spaces.
    const std::string padding(50000, ' ');
    const std::string input_100k = padding + value + padding;
    const int value_length = value.size();

    classifier->Annotate(input_100k);
    classifier->SuggestSelection(input_100k, {50000, 50001});
    classifier->ClassifyText(input_100k, {50000, 50000 + value_length});
  }
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 1213 | |
#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks classification_options->address_min_num_tokens: with the value 0 the
// sample span is classified as "address", with the value 5 the same span is
// classified as "other".
TEST_P(TextClassifierTest, MinAddressTokenLength) {
  CREATE_UNILIB_FOR_TESTING;
  const std::string test_model = ReadFile(GetModelPath() + GetParam());
  std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str());

  std::unique_ptr<TextClassifier> classifier;

  const std::string address_text = "I live at 350 Third Street, Cambridge.";

  // With unrestricted number of address tokens should behave normally.
  unpacked_model->classification_options->address_min_num_tokens = 0;

  flatbuffers::FlatBufferBuilder unrestricted_builder;
  unrestricted_builder.Finish(
      Model::Pack(unrestricted_builder, unpacked_model.get()));
  classifier = TextClassifier::FromUnownedBuffer(
      reinterpret_cast<const char*>(unrestricted_builder.GetBufferPointer()),
      unrestricted_builder.GetSize(), &unilib);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(address_text, {10, 37})),
            "address");

  // Raise number of address tokens to suppress the address classification.
  unpacked_model->classification_options->address_min_num_tokens = 5;

  // A second builder is used, so the first packed buffer stays alive and
  // untouched while the classifier is replaced.
  flatbuffers::FlatBufferBuilder restricted_builder;
  restricted_builder.Finish(
      Model::Pack(restricted_builder, unpacked_model.get()));
  classifier = TextClassifier::FromUnownedBuffer(
      reinterpret_cast<const char*>(restricted_builder.GetBufferPointer()),
      restricted_builder.GetSize(), &unilib);
  ASSERT_TRUE(classifier);

  EXPECT_EQ(FirstResult(classifier->ClassifyText(address_text, {10, 37})),
            "other");
}
#endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
| 1251 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1252 | } // namespace |
| 1253 | } // namespace libtextclassifier2 |