Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2017 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include "text-classifier.h" |
| 18 | |
| 19 | #include <fstream> |
| 20 | #include <iostream> |
| 21 | #include <memory> |
| 22 | #include <string> |
| 23 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 24 | #include "model_generated.h" |
| 25 | #include "types-test-util.h" |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 26 | #include "gmock/gmock.h" |
| 27 | #include "gtest/gtest.h" |
| 28 | |
| 29 | namespace libtextclassifier2 { |
| 30 | namespace { |
| 31 | |
| 32 | using testing::ElementsAreArray; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 33 | using testing::IsEmpty; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 34 | using testing::Pair; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 35 | using testing::Values; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 36 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 37 | std::string FirstResult(const std::vector<ClassificationResult>& results) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 38 | if (results.empty()) { |
| 39 | return "<INVALID RESULTS>"; |
| 40 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 41 | return results[0].collection; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 42 | } |
| 43 | |
| 44 | MATCHER_P3(IsAnnotatedSpan, start, end, best_class, "") { |
| 45 | return testing::Value(arg.span, Pair(start, end)) && |
| 46 | testing::Value(FirstResult(arg.classification), best_class); |
| 47 | } |
| 48 | |
// Reads the whole file at `file_name` into a string and returns it.
//
// The stream is opened in binary mode because callers pass the result to
// UnPackModel() as a serialized FlatBuffer; a text-mode stream may translate
// line endings on some platforms and silently corrupt the buffer.
//
// Returns an empty string when the file cannot be opened.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream), {});
}
| 53 | |
| 54 | std::string GetModelPath() { |
| 55 | return LIBTEXTCLASSIFIER_TEST_DATA_DIR; |
| 56 | } |
| 57 | |
| 58 | TEST(TextClassifierTest, EmbeddingExecutorLoadingFails) { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 59 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 60 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 61 | TextClassifier::FromPath(GetModelPath() + "wrong_embeddings.fb", &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 62 | EXPECT_FALSE(classifier); |
| 63 | } |
| 64 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 65 | class TextClassifierTest : public ::testing::TestWithParam<const char*> {}; |
| 66 | |
| 67 | INSTANTIATE_TEST_CASE_P(ClickContext, TextClassifierTest, |
| 68 | Values("test_model_cc.fb")); |
| 69 | INSTANTIATE_TEST_CASE_P(BoundsSensitive, TextClassifierTest, |
| 70 | Values("test_model.fb")); |
| 71 | |
| 72 | TEST_P(TextClassifierTest, ClassifyText) { |
| 73 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 74 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 75 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 76 | ASSERT_TRUE(classifier); |
| 77 | |
| 78 | EXPECT_EQ("other", |
| 79 | FirstResult(classifier->ClassifyText( |
| 80 | "this afternoon Barack Obama gave a speech at", {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 81 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 82 | "Call me at (800) 123-456 today", {11, 24}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 83 | |
| 84 | // More lines. |
| 85 | EXPECT_EQ("other", |
| 86 | FirstResult(classifier->ClassifyText( |
| 87 | "this afternoon Barack Obama gave a speech at|Visit " |
| 88 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 89 | {15, 27}))); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 90 | EXPECT_EQ("phone", |
| 91 | FirstResult(classifier->ClassifyText( |
| 92 | "this afternoon Barack Obama gave a speech at|Visit " |
| 93 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 94 | {90, 103}))); |
| 95 | |
| 96 | // Single word. |
| 97 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("obama", {0, 5}))); |
| 98 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText("asdf", {0, 4}))); |
| 99 | EXPECT_EQ("<INVALID RESULTS>", |
| 100 | FirstResult(classifier->ClassifyText("asdf", {0, 0}))); |
| 101 | |
| 102 | // Junk. |
| 103 | EXPECT_EQ("<INVALID RESULTS>", |
| 104 | FirstResult(classifier->ClassifyText("", {0, 0}))); |
| 105 | EXPECT_EQ("<INVALID RESULTS>", FirstResult(classifier->ClassifyText( |
| 106 | "a\n\n\n\nx x x\n\n\n\n\n\n", {1, 5}))); |
| 107 | } |
| 108 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 109 | TEST_P(TextClassifierTest, ClassifyTextDisabledFail) { |
| 110 | CREATE_UNILIB_FOR_TESTING; |
| 111 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 112 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 113 | |
| 114 | unpacked_model->classification_model.clear(); |
| 115 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 116 | unpacked_model->triggering_options->enabled_modes = ModeFlag_SELECTION; |
| 117 | |
| 118 | flatbuffers::FlatBufferBuilder builder; |
| 119 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 120 | |
| 121 | std::unique_ptr<TextClassifier> classifier = |
| 122 | TextClassifier::FromUnownedBuffer( |
| 123 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 124 | builder.GetSize(), &unilib); |
| 125 | |
| 126 | // The classification model is still needed for selection scores. |
| 127 | ASSERT_FALSE(classifier); |
| 128 | } |
| 129 | |
| 130 | TEST_P(TextClassifierTest, ClassifyTextDisabled) { |
| 131 | CREATE_UNILIB_FOR_TESTING; |
| 132 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 133 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 134 | |
| 135 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 136 | unpacked_model->triggering_options->enabled_modes = |
| 137 | ModeFlag_ANNOTATION_AND_SELECTION; |
| 138 | |
| 139 | flatbuffers::FlatBufferBuilder builder; |
| 140 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 141 | |
| 142 | std::unique_ptr<TextClassifier> classifier = |
| 143 | TextClassifier::FromUnownedBuffer( |
| 144 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 145 | builder.GetSize(), &unilib); |
| 146 | ASSERT_TRUE(classifier); |
| 147 | |
| 148 | EXPECT_THAT( |
| 149 | classifier->ClassifyText("Call me at (800) 123-456 today", {11, 24}), |
| 150 | IsEmpty()); |
| 151 | } |
| 152 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 153 | std::unique_ptr<RegexModel_::PatternT> MakePattern( |
| 154 | const std::string& collection_name, const std::string& pattern, |
| 155 | const bool enabled_for_classification, const bool enabled_for_selection, |
| 156 | const bool enabled_for_annotation, const float score) { |
| 157 | std::unique_ptr<RegexModel_::PatternT> result(new RegexModel_::PatternT); |
| 158 | result->collection_name = collection_name; |
| 159 | result->pattern = pattern; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 160 | // We cannot directly operate with |= on the flag, so use an int here. |
| 161 | int enabled_modes = ModeFlag_NONE; |
| 162 | if (enabled_for_annotation) enabled_modes |= ModeFlag_ANNOTATION; |
| 163 | if (enabled_for_classification) enabled_modes |= ModeFlag_CLASSIFICATION; |
| 164 | if (enabled_for_selection) enabled_modes |= ModeFlag_SELECTION; |
| 165 | result->enabled_modes = static_cast<ModeFlag>(enabled_modes); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 166 | result->target_classification_score = score; |
| 167 | result->priority_score = score; |
| 168 | return result; |
| 169 | } |
| 170 | |
| 171 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 172 | TEST_P(TextClassifierTest, ClassifyTextRegularExpression) { |
| 173 | CREATE_UNILIB_FOR_TESTING; |
| 174 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 175 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 176 | |
| 177 | // Add test regex models. |
| 178 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 179 | "person", "Barack Obama", /*enabled_for_classification=*/true, |
| 180 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 1.0)); |
| 181 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 182 | "flight", "[a-zA-Z]{2}\\d{2,4}", /*enabled_for_classification=*/true, |
| 183 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/false, 0.5)); |
| 184 | |
| 185 | flatbuffers::FlatBufferBuilder builder; |
| 186 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 187 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 188 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 189 | TextClassifier::FromUnownedBuffer( |
| 190 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 191 | builder.GetSize(), &unilib); |
| 192 | ASSERT_TRUE(classifier); |
| 193 | |
| 194 | EXPECT_EQ("flight", |
| 195 | FirstResult(classifier->ClassifyText( |
| 196 | "Your flight LX373 is delayed by 3 hours.", {12, 17}))); |
| 197 | EXPECT_EQ("person", |
| 198 | FirstResult(classifier->ClassifyText( |
| 199 | "this afternoon Barack Obama gave a speech at", {15, 27}))); |
| 200 | EXPECT_EQ("email", |
| 201 | FirstResult(classifier->ClassifyText("you@android.com", {0, 15}))); |
| 202 | EXPECT_EQ("email", FirstResult(classifier->ClassifyText( |
| 203 | "Contact me at you@android.com", {14, 29}))); |
| 204 | |
| 205 | EXPECT_EQ("url", FirstResult(classifier->ClassifyText( |
| 206 | "Visit www.google.com every today!", {6, 20}))); |
| 207 | |
| 208 | EXPECT_EQ("flight", FirstResult(classifier->ClassifyText("LX 37", {0, 5}))); |
| 209 | EXPECT_EQ("flight", FirstResult(classifier->ClassifyText("flight LX 37 abcd", |
| 210 | {7, 12}))); |
| 211 | |
| 212 | // More lines. |
| 213 | EXPECT_EQ("url", |
| 214 | FirstResult(classifier->ClassifyText( |
| 215 | "this afternoon Barack Obama gave a speech at|Visit " |
| 216 | "www.google.com every today!|Call me at (800) 123-456 today.", |
| 217 | {51, 65}))); |
| 218 | } |
| 219 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 220 | |
| 221 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 222 | TEST_P(TextClassifierTest, SuggestSelectionRegularExpression) { |
| 223 | CREATE_UNILIB_FOR_TESTING; |
| 224 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 225 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 226 | |
| 227 | // Add test regex models. |
| 228 | unpacked_model->regex_model.reset(new RegexModelT); |
| 229 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 230 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 231 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 232 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 233 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 234 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 235 | unpacked_model->regex_model->patterns.back()->priority_score = 1.1; |
| 236 | |
| 237 | flatbuffers::FlatBufferBuilder builder; |
| 238 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 239 | |
| 240 | std::unique_ptr<TextClassifier> classifier = |
| 241 | TextClassifier::FromUnownedBuffer( |
| 242 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 243 | builder.GetSize(), &unilib); |
| 244 | ASSERT_TRUE(classifier); |
| 245 | |
| 246 | // Check regular expression selection. |
| 247 | EXPECT_EQ(classifier->SuggestSelection( |
| 248 | "Your flight MA 0123 is delayed by 3 hours.", {12, 14}), |
| 249 | std::make_pair(12, 19)); |
| 250 | EXPECT_EQ(classifier->SuggestSelection( |
| 251 | "this afternoon Barack Obama gave a speech at", {15, 21}), |
| 252 | std::make_pair(15, 27)); |
| 253 | } |
| 254 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 255 | |
| 256 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 257 | TEST_P(TextClassifierTest, |
| 258 | SuggestSelectionRegularExpressionConflictsModelWins) { |
| 259 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 260 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 261 | |
| 262 | // Add test regex models. |
| 263 | unpacked_model->regex_model.reset(new RegexModelT); |
| 264 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 265 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 266 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 267 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 268 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 269 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 270 | unpacked_model->regex_model->patterns.back()->priority_score = 0.5; |
| 271 | |
| 272 | flatbuffers::FlatBufferBuilder builder; |
| 273 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 274 | |
| 275 | std::unique_ptr<TextClassifier> classifier = |
| 276 | TextClassifier::FromUnownedBuffer( |
| 277 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 278 | builder.GetSize()); |
| 279 | ASSERT_TRUE(classifier); |
| 280 | |
| 281 | // Check conflict resolution. |
| 282 | EXPECT_EQ( |
| 283 | classifier->SuggestSelection( |
| 284 | "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123", |
| 285 | {55, 57}), |
| 286 | std::make_pair(26, 62)); |
| 287 | } |
| 288 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 289 | |
| 290 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 291 | TEST_P(TextClassifierTest, |
| 292 | SuggestSelectionRegularExpressionConflictsRegexWins) { |
| 293 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 294 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 295 | |
| 296 | // Add test regex models. |
| 297 | unpacked_model->regex_model.reset(new RegexModelT); |
| 298 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 299 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 300 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 301 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 302 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 303 | /*enabled_for_selection=*/true, /*enabled_for_annotation=*/false, 1.0)); |
| 304 | unpacked_model->regex_model->patterns.back()->priority_score = 1.1; |
| 305 | |
| 306 | flatbuffers::FlatBufferBuilder builder; |
| 307 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 308 | |
| 309 | std::unique_ptr<TextClassifier> classifier = |
| 310 | TextClassifier::FromUnownedBuffer( |
| 311 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 312 | builder.GetSize()); |
| 313 | ASSERT_TRUE(classifier); |
| 314 | |
| 315 | // Check conflict resolution. |
| 316 | EXPECT_EQ( |
| 317 | classifier->SuggestSelection( |
| 318 | "saw Barack Obama today .. 350 Third Street, Cambridge, MA 0123", |
| 319 | {55, 57}), |
| 320 | std::make_pair(55, 62)); |
| 321 | } |
| 322 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 323 | |
| 324 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 325 | TEST_P(TextClassifierTest, AnnotateRegex) { |
| 326 | CREATE_UNILIB_FOR_TESTING; |
| 327 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 328 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 329 | |
| 330 | // Add test regex models. |
| 331 | unpacked_model->regex_model.reset(new RegexModelT); |
| 332 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 333 | "person", " (Barack Obama) ", /*enabled_for_classification=*/false, |
| 334 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 1.0)); |
| 335 | unpacked_model->regex_model->patterns.push_back(MakePattern( |
| 336 | "flight", "([a-zA-Z]{2} ?\\d{2,4})", /*enabled_for_classification=*/false, |
| 337 | /*enabled_for_selection=*/false, /*enabled_for_annotation=*/true, 0.5)); |
| 338 | flatbuffers::FlatBufferBuilder builder; |
| 339 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 340 | |
| 341 | std::unique_ptr<TextClassifier> classifier = |
| 342 | TextClassifier::FromUnownedBuffer( |
| 343 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 344 | builder.GetSize(), &unilib); |
| 345 | ASSERT_TRUE(classifier); |
| 346 | |
| 347 | const std::string test_string = |
| 348 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 349 | "number is 853 225 3556"; |
| 350 | EXPECT_THAT(classifier->Annotate(test_string), |
| 351 | ElementsAreArray({ |
| 352 | IsAnnotatedSpan(6, 18, "person"), |
| 353 | IsAnnotatedSpan(19, 24, "date"), |
| 354 | IsAnnotatedSpan(28, 55, "address"), |
| 355 | IsAnnotatedSpan(79, 91, "phone"), |
| 356 | })); |
| 357 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 358 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 359 | |
| 360 | TEST_P(TextClassifierTest, PhoneFiltering) { |
| 361 | CREATE_UNILIB_FOR_TESTING; |
| 362 | std::unique_ptr<TextClassifier> classifier = |
| 363 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 364 | ASSERT_TRUE(classifier); |
| 365 | |
| 366 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 367 | "phone: (123) 456 789", {7, 20}))); |
| 368 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 369 | "phone: (123) 456 789,0001112", {7, 25}))); |
| 370 | EXPECT_EQ("other", FirstResult(classifier->ClassifyText( |
| 371 | "phone: (123) 456 789,0001112", {7, 28}))); |
| 372 | } |
| 373 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 374 | TEST_P(TextClassifierTest, SuggestSelection) { |
| 375 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 376 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 377 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 378 | ASSERT_TRUE(classifier); |
| 379 | |
| 380 | EXPECT_EQ(classifier->SuggestSelection( |
| 381 | "this afternoon Barack Obama gave a speech at", {15, 21}), |
| 382 | std::make_pair(15, 21)); |
| 383 | |
| 384 | // Try passing whole string. |
| 385 | // If more than 1 token is specified, we should return back what entered. |
| 386 | EXPECT_EQ( |
| 387 | classifier->SuggestSelection("350 Third Street, Cambridge", {0, 27}), |
| 388 | std::make_pair(0, 27)); |
| 389 | |
| 390 | // Single letter. |
| 391 | EXPECT_EQ(classifier->SuggestSelection("a", {0, 1}), std::make_pair(0, 1)); |
| 392 | |
| 393 | // Single word. |
| 394 | EXPECT_EQ(classifier->SuggestSelection("asdf", {0, 4}), std::make_pair(0, 4)); |
| 395 | |
| 396 | EXPECT_EQ( |
| 397 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 398 | std::make_pair(11, 23)); |
| 399 | |
| 400 | // Unpaired bracket stripping. |
| 401 | EXPECT_EQ( |
| 402 | classifier->SuggestSelection("call me at (857) 225 3556 today", {11, 16}), |
| 403 | std::make_pair(11, 25)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 404 | EXPECT_EQ(classifier->SuggestSelection("call me at (857 today", {11, 15}), |
| 405 | std::make_pair(12, 15)); |
| 406 | EXPECT_EQ(classifier->SuggestSelection("call me at 3556) today", {11, 16}), |
| 407 | std::make_pair(11, 15)); |
| 408 | EXPECT_EQ(classifier->SuggestSelection("call me at )857( today", {11, 16}), |
| 409 | std::make_pair(12, 15)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 410 | |
| 411 | // If the resulting selection would be empty, the original span is returned. |
| 412 | EXPECT_EQ(classifier->SuggestSelection("call me at )( today", {11, 13}), |
| 413 | std::make_pair(11, 13)); |
| 414 | EXPECT_EQ(classifier->SuggestSelection("call me at ( today", {11, 12}), |
| 415 | std::make_pair(11, 12)); |
| 416 | EXPECT_EQ(classifier->SuggestSelection("call me at ) today", {11, 12}), |
| 417 | std::make_pair(11, 12)); |
| 418 | } |
| 419 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 420 | TEST_P(TextClassifierTest, SuggestSelectionDisabledFail) { |
| 421 | CREATE_UNILIB_FOR_TESTING; |
| 422 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 423 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 424 | |
| 425 | // Disable the selection model. |
| 426 | unpacked_model->selection_model.clear(); |
| 427 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 428 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ANNOTATION; |
| 429 | |
| 430 | flatbuffers::FlatBufferBuilder builder; |
| 431 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 432 | |
| 433 | std::unique_ptr<TextClassifier> classifier = |
| 434 | TextClassifier::FromUnownedBuffer( |
| 435 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 436 | builder.GetSize(), &unilib); |
| 437 | // Selection model needs to be present for annotation. |
| 438 | ASSERT_FALSE(classifier); |
| 439 | } |
| 440 | |
| 441 | TEST_P(TextClassifierTest, SuggestSelectionDisabled) { |
| 442 | CREATE_UNILIB_FOR_TESTING; |
| 443 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 444 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 445 | |
| 446 | // Disable the selection model. |
| 447 | unpacked_model->selection_model.clear(); |
| 448 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 449 | unpacked_model->triggering_options->enabled_modes = ModeFlag_CLASSIFICATION; |
| 450 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION; |
| 451 | |
| 452 | flatbuffers::FlatBufferBuilder builder; |
| 453 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 454 | |
| 455 | std::unique_ptr<TextClassifier> classifier = |
| 456 | TextClassifier::FromUnownedBuffer( |
| 457 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 458 | builder.GetSize(), &unilib); |
| 459 | ASSERT_TRUE(classifier); |
| 460 | |
| 461 | EXPECT_EQ( |
| 462 | classifier->SuggestSelection("call me at 857 225 3556 today", {11, 14}), |
| 463 | std::make_pair(11, 14)); |
| 464 | |
| 465 | EXPECT_EQ("phone", FirstResult(classifier->ClassifyText( |
| 466 | "call me at (800) 123-456 today", {11, 24}))); |
| 467 | |
| 468 | EXPECT_THAT(classifier->Annotate("call me at (800) 123-456 today"), |
| 469 | IsEmpty()); |
| 470 | } |
| 471 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 472 | TEST_P(TextClassifierTest, SuggestSelectionsAreSymmetric) { |
| 473 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 474 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 475 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 476 | ASSERT_TRUE(classifier); |
| 477 | |
| 478 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {0, 3}), |
| 479 | std::make_pair(0, 27)); |
| 480 | EXPECT_EQ(classifier->SuggestSelection("350 Third Street, Cambridge", {4, 9}), |
| 481 | std::make_pair(0, 27)); |
| 482 | EXPECT_EQ( |
| 483 | classifier->SuggestSelection("350 Third Street, Cambridge", {10, 16}), |
| 484 | std::make_pair(0, 27)); |
| 485 | EXPECT_EQ(classifier->SuggestSelection("a\nb\nc\n350 Third Street, Cambridge", |
| 486 | {16, 22}), |
| 487 | std::make_pair(6, 33)); |
| 488 | } |
| 489 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 490 | TEST_P(TextClassifierTest, SuggestSelectionWithNewLine) { |
| 491 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 492 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 493 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 494 | ASSERT_TRUE(classifier); |
| 495 | |
| 496 | EXPECT_EQ(classifier->SuggestSelection("abc\n857 225 3556", {4, 7}), |
| 497 | std::make_pair(4, 16)); |
| 498 | EXPECT_EQ(classifier->SuggestSelection("857 225 3556\nabc", {0, 3}), |
| 499 | std::make_pair(0, 12)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 500 | |
| 501 | SelectionOptions options; |
| 502 | EXPECT_EQ(classifier->SuggestSelection("857 225\n3556\nabc", {0, 3}, options), |
| 503 | std::make_pair(0, 7)); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 504 | } |
| 505 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 506 | TEST_P(TextClassifierTest, SuggestSelectionWithPunctuation) { |
| 507 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 508 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 509 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 510 | ASSERT_TRUE(classifier); |
| 511 | |
| 512 | // From the right. |
| 513 | EXPECT_EQ(classifier->SuggestSelection( |
| 514 | "this afternoon BarackObama, gave a speech at", {15, 26}), |
| 515 | std::make_pair(15, 26)); |
| 516 | |
| 517 | // From the right multiple. |
| 518 | EXPECT_EQ(classifier->SuggestSelection( |
| 519 | "this afternoon BarackObama,.,.,, gave a speech at", {15, 26}), |
| 520 | std::make_pair(15, 26)); |
| 521 | |
| 522 | // From the left multiple. |
| 523 | EXPECT_EQ(classifier->SuggestSelection( |
| 524 | "this afternoon ,.,.,,BarackObama gave a speech at", {21, 32}), |
| 525 | std::make_pair(21, 32)); |
| 526 | |
| 527 | // From both sides. |
| 528 | EXPECT_EQ(classifier->SuggestSelection( |
| 529 | "this afternoon !BarackObama,- gave a speech at", {16, 27}), |
| 530 | std::make_pair(16, 27)); |
| 531 | } |
| 532 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 533 | TEST_P(TextClassifierTest, SuggestSelectionNoCrashWithJunk) { |
| 534 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 535 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 536 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 537 | ASSERT_TRUE(classifier); |
| 538 | |
| 539 | // Try passing in bunch of invalid selections. |
| 540 | EXPECT_EQ(classifier->SuggestSelection("", {0, 27}), std::make_pair(0, 27)); |
| 541 | EXPECT_EQ(classifier->SuggestSelection("", {-10, 27}), |
| 542 | std::make_pair(-10, 27)); |
| 543 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {0, 27}), |
| 544 | std::make_pair(0, 27)); |
| 545 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-30, 300}), |
| 546 | std::make_pair(-30, 300)); |
| 547 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {-10, -1}), |
| 548 | std::make_pair(-10, -1)); |
| 549 | EXPECT_EQ(classifier->SuggestSelection("Word 1 2 3 hello!", {100, 17}), |
| 550 | std::make_pair(100, 17)); |
| 551 | } |
| 552 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 553 | TEST_P(TextClassifierTest, Annotate) { |
| 554 | CREATE_UNILIB_FOR_TESTING; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 555 | std::unique_ptr<TextClassifier> classifier = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 556 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 557 | ASSERT_TRUE(classifier); |
| 558 | |
| 559 | const std::string test_string = |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 560 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 561 | "number is 853 225 3556"; |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 562 | EXPECT_THAT(classifier->Annotate(test_string), |
| 563 | ElementsAreArray({ |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 564 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 565 | IsAnnotatedSpan(19, 24, "date"), |
| 566 | #endif |
| 567 | IsAnnotatedSpan(28, 55, "address"), |
| 568 | IsAnnotatedSpan(79, 91, "phone"), |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 569 | })); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 570 | |
| 571 | AnnotationOptions options; |
| 572 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 573 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 574 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 575 | } |
| 576 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 577 | TEST_P(TextClassifierTest, AnnotateSmallBatches) { |
| 578 | CREATE_UNILIB_FOR_TESTING; |
| 579 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 580 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 581 | |
| 582 | // Set the batch size. |
| 583 | unpacked_model->selection_options->batch_size = 4; |
| 584 | flatbuffers::FlatBufferBuilder builder; |
| 585 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 586 | |
| 587 | std::unique_ptr<TextClassifier> classifier = |
| 588 | TextClassifier::FromUnownedBuffer( |
| 589 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 590 | builder.GetSize(), &unilib); |
| 591 | ASSERT_TRUE(classifier); |
| 592 | |
| 593 | const std::string test_string = |
| 594 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 595 | "number is 853 225 3556"; |
| 596 | EXPECT_THAT(classifier->Annotate(test_string), |
| 597 | ElementsAreArray({ |
| 598 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 599 | IsAnnotatedSpan(19, 24, "date"), |
| 600 | #endif |
| 601 | IsAnnotatedSpan(28, 55, "address"), |
| 602 | IsAnnotatedSpan(79, 91, "phone"), |
| 603 | })); |
| 604 | |
| 605 | AnnotationOptions options; |
| 606 | EXPECT_THAT(classifier->Annotate("853 225 3556", options), |
| 607 | ElementsAreArray({IsAnnotatedSpan(0, 12, "phone")})); |
| 608 | EXPECT_TRUE(classifier->Annotate("853 225\n3556", options).empty()); |
| 609 | } |
| 610 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 611 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 612 | TEST_P(TextClassifierTest, AnnotateFilteringDiscardAll) { |
| 613 | CREATE_UNILIB_FOR_TESTING; |
| 614 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 615 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 616 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 617 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 618 | // Add test threshold. |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 619 | unpacked_model->triggering_options->min_annotate_confidence = |
| 620 | 2.f; // Discards all results. |
| 621 | flatbuffers::FlatBufferBuilder builder; |
| 622 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 623 | |
| 624 | std::unique_ptr<TextClassifier> classifier = |
| 625 | TextClassifier::FromUnownedBuffer( |
| 626 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 627 | builder.GetSize(), &unilib); |
| 628 | ASSERT_TRUE(classifier); |
| 629 | |
| 630 | const std::string test_string = |
| 631 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 632 | "number is 853 225 3556"; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 633 | |
| 634 | EXPECT_EQ(classifier->Annotate(test_string).size(), 1); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 635 | } |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 636 | #endif |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 637 | |
| 638 | TEST_P(TextClassifierTest, AnnotateFilteringKeepAll) { |
| 639 | CREATE_UNILIB_FOR_TESTING; |
| 640 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 641 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 642 | |
| 643 | // Add test thresholds. |
| 644 | unpacked_model->triggering_options.reset(new ModelTriggeringOptionsT); |
| 645 | unpacked_model->triggering_options->min_annotate_confidence = |
| 646 | 0.f; // Keeps all results. |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 647 | unpacked_model->triggering_options->enabled_modes = ModeFlag_ALL; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 648 | flatbuffers::FlatBufferBuilder builder; |
| 649 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 650 | |
| 651 | std::unique_ptr<TextClassifier> classifier = |
| 652 | TextClassifier::FromUnownedBuffer( |
| 653 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 654 | builder.GetSize(), &unilib); |
| 655 | ASSERT_TRUE(classifier); |
| 656 | |
| 657 | const std::string test_string = |
| 658 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 659 | "number is 853 225 3556"; |
| 660 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 661 | EXPECT_EQ(classifier->Annotate(test_string).size(), 3); |
| 662 | #else |
| 663 | // In non-ICU mode there is no "date" result. |
| 664 | EXPECT_EQ(classifier->Annotate(test_string).size(), 2); |
| 665 | #endif |
| 666 | } |
| 667 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 668 | TEST_P(TextClassifierTest, AnnotateDisabled) { |
| 669 | CREATE_UNILIB_FOR_TESTING; |
| 670 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 671 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 672 | |
| 673 | // Disable the model for annotation. |
| 674 | unpacked_model->enabled_modes = ModeFlag_CLASSIFICATION_AND_SELECTION; |
| 675 | flatbuffers::FlatBufferBuilder builder; |
| 676 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 677 | |
| 678 | std::unique_ptr<TextClassifier> classifier = |
| 679 | TextClassifier::FromUnownedBuffer( |
| 680 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 681 | builder.GetSize(), &unilib); |
| 682 | ASSERT_TRUE(classifier); |
| 683 | const std::string test_string = |
| 684 | "& saw Barack Obama today .. 350 Third Street, Cambridge\nand my phone " |
| 685 | "number is 853 225 3556"; |
| 686 | EXPECT_THAT(classifier->Annotate(test_string), IsEmpty()); |
| 687 | } |
| 688 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 689 | #ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU |
| 690 | TEST_P(TextClassifierTest, ClassifyTextDate) { |
| 691 | std::unique_ptr<TextClassifier> classifier = |
| 692 | TextClassifier::FromPath(GetModelPath() + GetParam()); |
| 693 | EXPECT_TRUE(classifier); |
| 694 | |
| 695 | std::vector<ClassificationResult> result; |
| 696 | ClassificationOptions options; |
| 697 | |
| 698 | options.reference_timezone = "Europe/Zurich"; |
| 699 | result = classifier->ClassifyText("january 1, 2017", {0, 15}, options); |
| 700 | |
| 701 | ASSERT_EQ(result.size(), 1); |
| 702 | EXPECT_THAT(result[0].collection, "date"); |
| 703 | EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000); |
| 704 | EXPECT_EQ(result[0].datetime_parse_result.granularity, |
| 705 | DatetimeGranularity::GRANULARITY_DAY); |
| 706 | result.clear(); |
| 707 | |
| 708 | options.reference_timezone = "America/Los_Angeles"; |
| 709 | result = classifier->ClassifyText("march 1, 2017", {0, 13}, options); |
| 710 | ASSERT_EQ(result.size(), 1); |
| 711 | EXPECT_THAT(result[0].collection, "date"); |
| 712 | EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1488355200000); |
| 713 | EXPECT_EQ(result[0].datetime_parse_result.granularity, |
| 714 | DatetimeGranularity::GRANULARITY_DAY); |
| 715 | result.clear(); |
| 716 | |
| 717 | options.reference_timezone = "America/Los_Angeles"; |
| 718 | result = classifier->ClassifyText("2018/01/01 10:30:20", {0, 19}, options); |
| 719 | ASSERT_EQ(result.size(), 1); |
| 720 | EXPECT_THAT(result[0].collection, "date"); |
| 721 | EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1514831420000); |
| 722 | EXPECT_EQ(result[0].datetime_parse_result.granularity, |
| 723 | DatetimeGranularity::GRANULARITY_SECOND); |
| 724 | result.clear(); |
| 725 | |
| 726 | // Date on another line. |
| 727 | options.reference_timezone = "Europe/Zurich"; |
| 728 | result = classifier->ClassifyText( |
| 729 | "hello world this is the first line\n" |
| 730 | "january 1, 2017", |
| 731 | {35, 50}, options); |
| 732 | ASSERT_EQ(result.size(), 1); |
| 733 | EXPECT_THAT(result[0].collection, "date"); |
| 734 | EXPECT_EQ(result[0].datetime_parse_result.time_ms_utc, 1483225200000); |
| 735 | EXPECT_EQ(result[0].datetime_parse_result.granularity, |
| 736 | DatetimeGranularity::GRANULARITY_DAY); |
| 737 | result.clear(); |
| 738 | } |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 739 | |
| 740 | TEST_P(TextClassifierTest, SuggestTextDateDisabled) { |
| 741 | CREATE_UNILIB_FOR_TESTING; |
| 742 | const std::string test_model = ReadFile(GetModelPath() + GetParam()); |
| 743 | std::unique_ptr<ModelT> unpacked_model = UnPackModel(test_model.c_str()); |
| 744 | |
| 745 | // Disable the patterns for selection. |
| 746 | for (int i = 0; i < unpacked_model->datetime_model->patterns.size(); i++) { |
| 747 | unpacked_model->datetime_model->patterns[i]->enabled_modes = |
| 748 | ModeFlag_ANNOTATION_AND_CLASSIFICATION; |
| 749 | } |
| 750 | flatbuffers::FlatBufferBuilder builder; |
| 751 | builder.Finish(Model::Pack(builder, unpacked_model.get())); |
| 752 | |
| 753 | std::unique_ptr<TextClassifier> classifier = |
| 754 | TextClassifier::FromUnownedBuffer( |
| 755 | reinterpret_cast<const char*>(builder.GetBufferPointer()), |
| 756 | builder.GetSize(), &unilib); |
| 757 | ASSERT_TRUE(classifier); |
| 758 | EXPECT_EQ("date", |
| 759 | FirstResult(classifier->ClassifyText("january 1, 2017", {0, 15}))); |
| 760 | EXPECT_EQ(classifier->SuggestSelection("january 1, 2017", {0, 7}), |
| 761 | std::make_pair(0, 7)); |
| 762 | EXPECT_THAT(classifier->Annotate("january 1, 2017"), |
| 763 | ElementsAreArray({IsAnnotatedSpan(0, 15, "date")})); |
| 764 | } |
#endif  // LIBTEXTCLASSIFIER_CALENDAR_ICU
| 766 | |
| 767 | class TestingTextClassifier : public TextClassifier { |
| 768 | public: |
| 769 | TestingTextClassifier(const std::string& model, const UniLib* unilib) |
| 770 | : TextClassifier(ViewModel(model.data(), model.size()), unilib) {} |
| 771 | |
| 772 | using TextClassifier::ResolveConflicts; |
| 773 | }; |
| 774 | |
| 775 | AnnotatedSpan MakeAnnotatedSpan(CodepointSpan span, |
| 776 | const std::string& collection, |
| 777 | const float score) { |
| 778 | AnnotatedSpan result; |
| 779 | result.span = span; |
| 780 | result.classification.push_back({collection, score}); |
| 781 | return result; |
| 782 | } |
| 783 | |
| 784 | TEST(TextClassifierTest, ResolveConflictsTrivial) { |
| 785 | CREATE_UNILIB_FOR_TESTING; |
| 786 | TestingTextClassifier classifier("", &unilib); |
| 787 | |
| 788 | std::vector<AnnotatedSpan> candidates{ |
| 789 | {MakeAnnotatedSpan({0, 1}, "phone", 1.0)}}; |
| 790 | |
| 791 | std::vector<int> chosen; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 792 | classifier.ResolveConflicts(candidates, /*context=*/"", |
| 793 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 794 | EXPECT_THAT(chosen, ElementsAreArray({0})); |
| 795 | } |
| 796 | |
| 797 | TEST(TextClassifierTest, ResolveConflictsSequence) { |
| 798 | CREATE_UNILIB_FOR_TESTING; |
| 799 | TestingTextClassifier classifier("", &unilib); |
| 800 | |
| 801 | std::vector<AnnotatedSpan> candidates{{ |
| 802 | MakeAnnotatedSpan({0, 1}, "phone", 1.0), |
| 803 | MakeAnnotatedSpan({1, 2}, "phone", 1.0), |
| 804 | MakeAnnotatedSpan({2, 3}, "phone", 1.0), |
| 805 | MakeAnnotatedSpan({3, 4}, "phone", 1.0), |
| 806 | MakeAnnotatedSpan({4, 5}, "phone", 1.0), |
| 807 | }}; |
| 808 | |
| 809 | std::vector<int> chosen; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 810 | classifier.ResolveConflicts(candidates, /*context=*/"", |
| 811 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 812 | EXPECT_THAT(chosen, ElementsAreArray({0, 1, 2, 3, 4})); |
| 813 | } |
| 814 | |
| 815 | TEST(TextClassifierTest, ResolveConflictsThreeSpans) { |
| 816 | CREATE_UNILIB_FOR_TESTING; |
| 817 | TestingTextClassifier classifier("", &unilib); |
| 818 | |
| 819 | std::vector<AnnotatedSpan> candidates{{ |
| 820 | MakeAnnotatedSpan({0, 3}, "phone", 1.0), |
| 821 | MakeAnnotatedSpan({1, 5}, "phone", 0.5), // Looser! |
| 822 | MakeAnnotatedSpan({3, 7}, "phone", 1.0), |
| 823 | }}; |
| 824 | |
| 825 | std::vector<int> chosen; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 826 | classifier.ResolveConflicts(candidates, /*context=*/"", |
| 827 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 828 | EXPECT_THAT(chosen, ElementsAreArray({0, 2})); |
| 829 | } |
| 830 | |
| 831 | TEST(TextClassifierTest, ResolveConflictsThreeSpansReversed) { |
| 832 | CREATE_UNILIB_FOR_TESTING; |
| 833 | TestingTextClassifier classifier("", &unilib); |
| 834 | |
| 835 | std::vector<AnnotatedSpan> candidates{{ |
| 836 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), // Looser! |
| 837 | MakeAnnotatedSpan({1, 5}, "phone", 1.0), |
| 838 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), // Looser! |
| 839 | }}; |
| 840 | |
| 841 | std::vector<int> chosen; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 842 | classifier.ResolveConflicts(candidates, /*context=*/"", |
| 843 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 844 | EXPECT_THAT(chosen, ElementsAreArray({1})); |
| 845 | } |
| 846 | |
| 847 | TEST(TextClassifierTest, ResolveConflictsFiveSpans) { |
| 848 | CREATE_UNILIB_FOR_TESTING; |
| 849 | TestingTextClassifier classifier("", &unilib); |
| 850 | |
| 851 | std::vector<AnnotatedSpan> candidates{{ |
| 852 | MakeAnnotatedSpan({0, 3}, "phone", 0.5), |
| 853 | MakeAnnotatedSpan({1, 5}, "other", 1.0), // Looser! |
| 854 | MakeAnnotatedSpan({3, 7}, "phone", 0.6), |
| 855 | MakeAnnotatedSpan({8, 12}, "phone", 0.6), // Looser! |
| 856 | MakeAnnotatedSpan({11, 15}, "phone", 0.9), |
| 857 | }}; |
| 858 | |
| 859 | std::vector<int> chosen; |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 860 | classifier.ResolveConflicts(candidates, /*context=*/"", |
| 861 | /*interpreter_manager=*/nullptr, &chosen); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 862 | EXPECT_THAT(chosen, ElementsAreArray({0, 2, 4})); |
| 863 | } |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 864 | |
Lukas Zilka | df710db | 2018-02-27 12:44:09 +0100 | [diff] [blame] | 865 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 866 | TEST_P(TextClassifierTest, LongInput) { |
| 867 | CREATE_UNILIB_FOR_TESTING; |
| 868 | std::unique_ptr<TextClassifier> classifier = |
| 869 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
| 870 | ASSERT_TRUE(classifier); |
| 871 | |
| 872 | for (const auto& type_value_pair : |
| 873 | std::vector<std::pair<std::string, std::string>>{ |
| 874 | {"address", "350 Third Street, Cambridge"}, |
| 875 | {"phone", "123 456-7890"}, |
| 876 | {"url", "www.google.com"}, |
| 877 | {"email", "someone@gmail.com"}, |
| 878 | {"flight", "LX 38"}, |
| 879 | {"date", "September 1, 2018"}}) { |
| 880 | const std::string input_100k = std::string(50000, ' ') + |
| 881 | type_value_pair.second + |
| 882 | std::string(50000, ' '); |
| 883 | const int value_length = type_value_pair.second.size(); |
| 884 | |
| 885 | EXPECT_THAT(classifier->Annotate(input_100k), |
| 886 | ElementsAreArray({IsAnnotatedSpan(50000, 50000 + value_length, |
| 887 | type_value_pair.first)})); |
| 888 | EXPECT_EQ(classifier->SuggestSelection(input_100k, {50000, 50001}), |
| 889 | std::make_pair(50000, 50000 + value_length)); |
| 890 | EXPECT_EQ(type_value_pair.first, |
| 891 | FirstResult(classifier->ClassifyText( |
| 892 | input_100k, {50000, 50000 + value_length}))); |
| 893 | } |
| 894 | } |
| 895 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 896 | |
Lukas Zilka | ba849e7 | 2018-03-08 14:48:21 +0100 | [diff] [blame^] | 897 | #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU |
| 898 | // These coarse tests are there only to make sure the execution happens in |
| 899 | // reasonable amount of time. |
| 900 | TEST_P(TextClassifierTest, LongInputNoResultCheck) { |
| 901 | CREATE_UNILIB_FOR_TESTING; |
| 902 | std::unique_ptr<TextClassifier> classifier = |
| 903 | TextClassifier::FromPath(GetModelPath() + GetParam(), &unilib); |
| 904 | ASSERT_TRUE(classifier); |
| 905 | |
| 906 | for (const std::string& value : |
| 907 | std::vector<std::string>{"http://www.aaaaaaaaaaaaaaaaaaaa.com "}) { |
| 908 | const std::string input_100k = |
| 909 | std::string(50000, ' ') + value + std::string(50000, ' '); |
| 910 | const int value_length = value.size(); |
| 911 | |
| 912 | classifier->Annotate(input_100k); |
| 913 | classifier->SuggestSelection(input_100k, {50000, 50001}); |
| 914 | classifier->ClassifyText(input_100k, {50000, 50000 + value_length}); |
| 915 | } |
| 916 | } |
| 917 | #endif // LIBTEXTCLASSIFIER_UNILIB_ICU |
| 918 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 919 | } // namespace |
| 920 | } // namespace libtextclassifier2 |