Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 1 | /* |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 2 | * Copyright (C) 2018 The Android Open Source Project |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include <time.h> |
| 18 | #include <fstream> |
| 19 | #include <iostream> |
| 20 | #include <memory> |
| 21 | #include <string> |
| 22 | |
| 23 | #include "gmock/gmock.h" |
| 24 | #include "gtest/gtest.h" |
| 25 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 26 | #include "annotator/annotator.h" |
| 27 | #include "annotator/datetime/parser.h" |
| 28 | #include "annotator/model_generated.h" |
| 29 | #include "annotator/types-test-util.h" |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 30 | |
| 31 | using testing::ElementsAreArray; |
| 32 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 33 | namespace libtextclassifier3 { |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 34 | namespace { |
| 35 | |
| 36 | std::string GetModelPath() { |
Tony Mak | a0f598b | 2018-11-20 20:39:04 +0000 | [diff] [blame^] | 37 | return TC3_TEST_DATA_DIR; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 38 | } |
| 39 | |
// Reads the whole content of `file_name` into a string.
//
// The file is opened in binary mode: the callers in this file load a
// serialized model, and text mode may translate line endings or stop at
// control characters on some platforms, which would corrupt the buffer.
//
// Returns an empty string when the file cannot be opened.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream),
                     std::istreambuf_iterator<char>());
}
| 44 | |
| 45 | std::string FormatMillis(int64 time_ms_utc) { |
| 46 | long time_seconds = time_ms_utc / 1000; // NOLINT |
| 47 | // Format time, "ddd yyyy-mm-dd hh:mm:ss zzz" |
| 48 | char buffer[512]; |
| 49 | strftime(buffer, sizeof(buffer), "%a %Y-%m-%d %H:%M:%S %Z", |
| 50 | localtime(&time_seconds)); |
| 51 | return std::string(buffer); |
| 52 | } |
| 53 | |
| 54 | class ParserTest : public testing::Test { |
| 55 | public: |
| 56 | void SetUp() override { |
| 57 | model_buffer_ = ReadFile(GetModelPath() + "test_model.fb"); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 58 | classifier_ = Annotator::FromUnownedBuffer(model_buffer_.data(), |
| 59 | model_buffer_.size(), &unilib_); |
| 60 | TC3_CHECK(classifier_); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 61 | parser_ = classifier_->DatetimeParserForTests(); |
| 62 | } |
| 63 | |
| 64 | bool HasNoResult(const std::string& text, bool anchor_start_end = false, |
| 65 | const std::string& timezone = "Europe/Zurich") { |
| 66 | std::vector<DatetimeParseResultSpan> results; |
| 67 | if (!parser_->Parse(text, 0, timezone, /*locales=*/"", ModeFlag_ANNOTATION, |
| 68 | anchor_start_end, &results)) { |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 69 | TC3_LOG(ERROR) << text; |
| 70 | TC3_CHECK(false); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 71 | } |
| 72 | return results.empty(); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 73 | } |
| 74 | |
| 75 | bool ParsesCorrectly(const std::string& marked_text, |
| 76 | const int64 expected_ms_utc, |
| 77 | DatetimeGranularity expected_granularity, |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 78 | bool anchor_start_end = false, |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 79 | const std::string& timezone = "Europe/Zurich", |
| 80 | const std::string& locales = "en-US") { |
| 81 | const UnicodeText marked_text_unicode = |
| 82 | UTF8ToUnicodeText(marked_text, /*do_copy=*/false); |
| 83 | auto brace_open_it = |
| 84 | std::find(marked_text_unicode.begin(), marked_text_unicode.end(), '{'); |
| 85 | auto brace_end_it = |
| 86 | std::find(marked_text_unicode.begin(), marked_text_unicode.end(), '}'); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 87 | TC3_CHECK(brace_open_it != marked_text_unicode.end()); |
| 88 | TC3_CHECK(brace_end_it != marked_text_unicode.end()); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 89 | |
| 90 | std::string text; |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 91 | text += |
| 92 | UnicodeText::UTF8Substring(marked_text_unicode.begin(), brace_open_it); |
| 93 | text += UnicodeText::UTF8Substring(std::next(brace_open_it), brace_end_it); |
| 94 | text += UnicodeText::UTF8Substring(std::next(brace_end_it), |
| 95 | marked_text_unicode.end()); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 96 | |
| 97 | std::vector<DatetimeParseResultSpan> results; |
| 98 | |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 99 | if (!parser_->Parse(text, 0, timezone, locales, ModeFlag_ANNOTATION, |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 100 | anchor_start_end, &results)) { |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 101 | TC3_LOG(ERROR) << text; |
| 102 | TC3_CHECK(false); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 103 | } |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 104 | if (results.empty()) { |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 105 | TC3_LOG(ERROR) << "No results."; |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 106 | return false; |
| 107 | } |
| 108 | |
| 109 | const int expected_start_index = |
| 110 | std::distance(marked_text_unicode.begin(), brace_open_it); |
| 111 | // The -1 bellow is to account for the opening bracket character. |
| 112 | const int expected_end_index = |
| 113 | std::distance(marked_text_unicode.begin(), brace_end_it) - 1; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 114 | |
| 115 | std::vector<DatetimeParseResultSpan> filtered_results; |
| 116 | for (const DatetimeParseResultSpan& result : results) { |
| 117 | if (SpansOverlap(result.span, |
| 118 | {expected_start_index, expected_end_index})) { |
| 119 | filtered_results.push_back(result); |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | const std::vector<DatetimeParseResultSpan> expected{ |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 124 | {{expected_start_index, expected_end_index}, |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 125 | {expected_ms_utc, expected_granularity}, |
| 126 | /*target_classification_score=*/1.0, |
Tony Mak | 51a9e54 | 2018-11-02 13:36:22 +0000 | [diff] [blame] | 127 | /*priority_score=*/0.1}}; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 128 | const bool matches = |
| 129 | testing::Matches(ElementsAreArray(expected))(filtered_results); |
| 130 | if (!matches) { |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 131 | TC3_LOG(ERROR) << "Expected: " << expected[0] << " which corresponds to: " |
| 132 | << FormatMillis(expected[0].data.time_ms_utc); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 133 | for (int i = 0; i < filtered_results.size(); ++i) { |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 134 | TC3_LOG(ERROR) << "Actual[" << i << "]: " << filtered_results[i] |
| 135 | << " which corresponds to: " |
| 136 | << FormatMillis(filtered_results[i].data.time_ms_utc); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 137 | } |
| 138 | } |
| 139 | return matches; |
| 140 | } |
| 141 | |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 142 | bool ParsesCorrectlyGerman(const std::string& marked_text, |
| 143 | const int64 expected_ms_utc, |
| 144 | DatetimeGranularity expected_granularity) { |
| 145 | return ParsesCorrectly(marked_text, expected_ms_utc, expected_granularity, |
| 146 | /*anchor_start_end=*/false, |
| 147 | /*timezone=*/"Europe/Zurich", /*locales=*/"de"); |
| 148 | } |
| 149 | |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 150 | protected: |
| 151 | std::string model_buffer_; |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 152 | std::unique_ptr<Annotator> classifier_; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 153 | const DatetimeParser* parser_; |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 154 | UniLib unilib_; |
| 155 | }; |
| 156 | |
| 157 | // Test with just a few cases to make debugging of general failures easier. |
| 158 | TEST_F(ParserTest, ParseShort) { |
| 159 | EXPECT_TRUE( |
| 160 | ParsesCorrectly("{January 1, 1988}", 567990000000, GRANULARITY_DAY)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 161 | } |
| 162 | |
| 163 | TEST_F(ParserTest, Parse) { |
| 164 | EXPECT_TRUE( |
| 165 | ParsesCorrectly("{January 1, 1988}", 567990000000, GRANULARITY_DAY)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 166 | EXPECT_TRUE( |
| 167 | ParsesCorrectly("{january 31 2018}", 1517353200000, GRANULARITY_DAY)); |
| 168 | EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000, |
| 169 | GRANULARITY_DAY)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 170 | EXPECT_TRUE(ParsesCorrectly("{09/Mar/2004 22:02:40}", 1078866160000, |
| 171 | GRANULARITY_SECOND)); |
| 172 | EXPECT_TRUE(ParsesCorrectly("{Dec 2, 2010 2:39:58 AM}", 1291253998000, |
| 173 | GRANULARITY_SECOND)); |
| 174 | EXPECT_TRUE(ParsesCorrectly("{Jun 09 2011 15:28:14}", 1307626094000, |
| 175 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 176 | EXPECT_TRUE( |
| 177 | ParsesCorrectly("{Mar 16 08:12:04}", 6419524000, GRANULARITY_SECOND)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 178 | EXPECT_TRUE(ParsesCorrectly("{2010-06-26 02:31:29}", 1277512289000, |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 179 | GRANULARITY_SECOND)); |
| 180 | EXPECT_TRUE(ParsesCorrectly("{2006/01/22 04:11:05}", 1137899465000, |
| 181 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 182 | EXPECT_TRUE(ParsesCorrectly("{11:42:35}", 38555000, GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 183 | EXPECT_TRUE( |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 184 | ParsesCorrectly("{23/Apr 11:42:35}", 9715355000, GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 185 | EXPECT_TRUE(ParsesCorrectly("{23/Apr/2015 11:42:35}", 1429782155000, |
| 186 | GRANULARITY_SECOND)); |
| 187 | EXPECT_TRUE(ParsesCorrectly("{23-Apr-2015 11:42:35}", 1429782155000, |
| 188 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 189 | EXPECT_TRUE(ParsesCorrectly("{23 Apr 2015 11:42:35}", 1429782155000, |
| 190 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 191 | EXPECT_TRUE(ParsesCorrectly("{04/23/15 11:42:35}", 1429782155000, |
| 192 | GRANULARITY_SECOND)); |
| 193 | EXPECT_TRUE(ParsesCorrectly("{04/23/2015 11:42:35}", 1429782155000, |
| 194 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 195 | EXPECT_TRUE(ParsesCorrectly("{9/28/2011 2:23:15 PM}", 1317212595000, |
| 196 | GRANULARITY_SECOND)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 197 | EXPECT_TRUE(ParsesCorrectly( |
| 198 | "Are sentiments apartments decisively the especially alteration. " |
| 199 | "Thrown shy denote ten ladies though ask saw. Or by to he going " |
| 200 | "think order event music. Incommode so intention defective at " |
| 201 | "convinced. Led income months itself and houses you. After nor " |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 202 | "you leave might share court balls. {19/apr/2010 06:36:15} Are " |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 203 | "sentiments apartments decisively the especially alteration. " |
| 204 | "Thrown shy denote ten ladies though ask saw. Or by to he going " |
| 205 | "think order event music. Incommode so intention defective at " |
| 206 | "convinced. Led income months itself and houses you. After nor " |
| 207 | "you leave might share court balls. ", |
| 208 | 1271651775000, GRANULARITY_SECOND)); |
| 209 | EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4:30}", 1514777400000, |
| 210 | GRANULARITY_MINUTE)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 211 | EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4:30 am}", 1514777400000, |
| 212 | GRANULARITY_MINUTE)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 213 | EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4pm}", 1514818800000, |
| 214 | GRANULARITY_HOUR)); |
| 215 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 216 | EXPECT_TRUE(ParsesCorrectly("{today at 0:00}", -3600000, GRANULARITY_MINUTE)); |
| 217 | EXPECT_TRUE(ParsesCorrectly("{today at 0:00}", -57600000, GRANULARITY_MINUTE, |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 218 | /*anchor_start_end=*/false, |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 219 | "America/Los_Angeles")); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 220 | EXPECT_TRUE( |
| 221 | ParsesCorrectly("{tomorrow at 4:00}", 97200000, GRANULARITY_MINUTE)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 222 | EXPECT_TRUE(ParsesCorrectly("{tomorrow at 4am}", 97200000, GRANULARITY_HOUR)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 223 | EXPECT_TRUE( |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 224 | ParsesCorrectly("{wednesday at 4am}", 529200000, GRANULARITY_HOUR)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 225 | EXPECT_TRUE(ParsesCorrectly("last seen {today at 9:01 PM}", 72060000, |
| 226 | GRANULARITY_MINUTE)); |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 227 | } |
| 228 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 229 | TEST_F(ParserTest, ParseWithAnchor) { |
| 230 | EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000, |
| 231 | GRANULARITY_DAY, /*anchor_start_end=*/false)); |
| 232 | EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000, |
| 233 | GRANULARITY_DAY, /*anchor_start_end=*/true)); |
| 234 | EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000, |
| 235 | GRANULARITY_DAY, /*anchor_start_end=*/false)); |
| 236 | EXPECT_TRUE(HasNoResult("lorem 1 january 2018 ipsum", |
| 237 | /*anchor_start_end=*/true)); |
| 238 | } |
| 239 | |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 240 | TEST_F(ParserTest, ParseGerman) { |
| 241 | EXPECT_TRUE( |
| 242 | ParsesCorrectlyGerman("{Januar 1 2018}", 1514761200000, GRANULARITY_DAY)); |
| 243 | EXPECT_TRUE( |
| 244 | ParsesCorrectlyGerman("{1 2 2018}", 1517439600000, GRANULARITY_DAY)); |
| 245 | EXPECT_TRUE(ParsesCorrectlyGerman("lorem {1 Januar 2018} ipsum", |
| 246 | 1514761200000, GRANULARITY_DAY)); |
| 247 | EXPECT_TRUE(ParsesCorrectlyGerman("{19/Apr/2010:06:36:15}", 1271651775000, |
| 248 | GRANULARITY_SECOND)); |
| 249 | EXPECT_TRUE(ParsesCorrectlyGerman("{09/März/2004 22:02:40}", 1078866160000, |
| 250 | GRANULARITY_SECOND)); |
| 251 | EXPECT_TRUE(ParsesCorrectlyGerman("{Dez 2, 2010 2:39:58}", 1291253998000, |
| 252 | GRANULARITY_SECOND)); |
| 253 | EXPECT_TRUE(ParsesCorrectlyGerman("{Juni 09 2011 15:28:14}", 1307626094000, |
| 254 | GRANULARITY_SECOND)); |
| 255 | EXPECT_TRUE(ParsesCorrectlyGerman("{März 16 08:12:04}", 6419524000, |
| 256 | GRANULARITY_SECOND)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 257 | EXPECT_TRUE(ParsesCorrectlyGerman("{2010-06-26 02:31:29}", 1277512289000, |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 258 | GRANULARITY_SECOND)); |
| 259 | EXPECT_TRUE(ParsesCorrectlyGerman("{2006/01/22 04:11:05}", 1137899465000, |
| 260 | GRANULARITY_SECOND)); |
| 261 | EXPECT_TRUE( |
| 262 | ParsesCorrectlyGerman("{11:42:35}", 38555000, GRANULARITY_SECOND)); |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 263 | EXPECT_TRUE(ParsesCorrectlyGerman("{23/Apr 11:42:35}", 9715355000, |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 264 | GRANULARITY_SECOND)); |
| 265 | EXPECT_TRUE(ParsesCorrectlyGerman("{23/Apr/2015:11:42:35}", 1429782155000, |
| 266 | GRANULARITY_SECOND)); |
| 267 | EXPECT_TRUE(ParsesCorrectlyGerman("{23/Apr/2015 11:42:35}", 1429782155000, |
| 268 | GRANULARITY_SECOND)); |
| 269 | EXPECT_TRUE(ParsesCorrectlyGerman("{23-Apr-2015 11:42:35}", 1429782155000, |
| 270 | GRANULARITY_SECOND)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 271 | EXPECT_TRUE(ParsesCorrectlyGerman("{23 Apr 2015 11:42:35}", 1429782155000, |
| 272 | GRANULARITY_SECOND)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 273 | EXPECT_TRUE(ParsesCorrectlyGerman("{04/23/15 11:42:35}", 1429782155000, |
| 274 | GRANULARITY_SECOND)); |
| 275 | EXPECT_TRUE(ParsesCorrectlyGerman("{04/23/2015 11:42:35}", 1429782155000, |
| 276 | GRANULARITY_SECOND)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 277 | EXPECT_TRUE(ParsesCorrectlyGerman("{19/apr/2010:06:36:15}", 1271651775000, |
| 278 | GRANULARITY_SECOND)); |
| 279 | EXPECT_TRUE(ParsesCorrectlyGerman("{januar 1 2018 um 4:30}", 1514777400000, |
| 280 | GRANULARITY_MINUTE)); |
| 281 | EXPECT_TRUE(ParsesCorrectlyGerman("{januar 1 2018 um 4:30 nachm}", |
| 282 | 1514820600000, GRANULARITY_MINUTE)); |
| 283 | EXPECT_TRUE(ParsesCorrectlyGerman("{januar 1 2018 um 4 nachm}", 1514818800000, |
| 284 | GRANULARITY_HOUR)); |
| 285 | EXPECT_TRUE( |
| 286 | ParsesCorrectlyGerman("{14.03.2017}", 1489446000000, GRANULARITY_DAY)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 287 | EXPECT_TRUE( |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 288 | ParsesCorrectlyGerman("{morgen 0:00}", 82800000, GRANULARITY_MINUTE)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 289 | EXPECT_TRUE( |
| 290 | ParsesCorrectlyGerman("{morgen um 4:00}", 97200000, GRANULARITY_MINUTE)); |
| 291 | EXPECT_TRUE( |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 292 | ParsesCorrectlyGerman("{morgen um 4 vorm}", 97200000, GRANULARITY_HOUR)); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 293 | } |
| 294 | |
| 295 | TEST_F(ParserTest, ParseNonUs) { |
| 296 | EXPECT_TRUE(ParsesCorrectly("{1/5/15}", 1430431200000, GRANULARITY_DAY, |
| 297 | /*anchor_start_end=*/false, |
| 298 | /*timezone=*/"Europe/Zurich", |
| 299 | /*locales=*/"en-GB")); |
| 300 | EXPECT_TRUE(ParsesCorrectly("{1/5/15}", 1430431200000, GRANULARITY_DAY, |
| 301 | /*anchor_start_end=*/false, |
| 302 | /*timezone=*/"Europe/Zurich", /*locales=*/"en")); |
| 303 | } |
| 304 | |
| 305 | TEST_F(ParserTest, ParseUs) { |
| 306 | EXPECT_TRUE(ParsesCorrectly("{1/5/15}", 1420412400000, GRANULARITY_DAY, |
| 307 | /*anchor_start_end=*/false, |
| 308 | /*timezone=*/"Europe/Zurich", |
| 309 | /*locales=*/"en-US")); |
| 310 | EXPECT_TRUE(ParsesCorrectly("{1/5/15}", 1420412400000, GRANULARITY_DAY, |
| 311 | /*anchor_start_end=*/false, |
| 312 | /*timezone=*/"Europe/Zurich", |
| 313 | /*locales=*/"es-US")); |
| 314 | } |
| 315 | |
| 316 | TEST_F(ParserTest, ParseUnknownLanguage) { |
| 317 | EXPECT_TRUE(ParsesCorrectly("bylo to {31. 12. 2015} v 6 hodin", 1451516400000, |
| 318 | GRANULARITY_DAY, |
| 319 | /*anchor_start_end=*/false, |
| 320 | /*timezone=*/"Europe/Zurich", /*locales=*/"xx")); |
| 321 | } |
| 322 | |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 323 | class ParserLocaleTest : public testing::Test { |
| 324 | public: |
| 325 | void SetUp() override; |
| 326 | bool HasResult(const std::string& input, const std::string& locales); |
| 327 | |
| 328 | protected: |
| 329 | UniLib unilib_; |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 330 | CalendarLib calendarlib_; |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 331 | flatbuffers::FlatBufferBuilder builder_; |
| 332 | std::unique_ptr<DatetimeParser> parser_; |
| 333 | }; |
| 334 | |
| 335 | void AddPattern(const std::string& regex, int locale, |
| 336 | std::vector<std::unique_ptr<DatetimeModelPatternT>>* patterns) { |
| 337 | patterns->emplace_back(new DatetimeModelPatternT); |
| 338 | patterns->back()->regexes.emplace_back(new DatetimeModelPattern_::RegexT); |
| 339 | patterns->back()->regexes.back()->pattern = regex; |
| 340 | patterns->back()->regexes.back()->groups.push_back( |
| 341 | DatetimeGroupType_GROUP_UNUSED); |
| 342 | patterns->back()->locales.push_back(locale); |
| 343 | } |
| 344 | |
| 345 | void ParserLocaleTest::SetUp() { |
| 346 | DatetimeModelT model; |
| 347 | model.use_extractors_for_locating = false; |
| 348 | model.locales.clear(); |
| 349 | model.locales.push_back("en-US"); |
| 350 | model.locales.push_back("en-CH"); |
| 351 | model.locales.push_back("zh-Hant"); |
| 352 | model.locales.push_back("en-*"); |
| 353 | model.locales.push_back("zh-Hant-*"); |
| 354 | model.locales.push_back("*-CH"); |
Lukas Zilka | 434442d | 2018-04-25 11:38:51 +0200 | [diff] [blame] | 355 | model.locales.push_back("default"); |
| 356 | model.default_locales.push_back(6); |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 357 | |
| 358 | AddPattern(/*regex=*/"en-US", /*locale=*/0, &model.patterns); |
| 359 | AddPattern(/*regex=*/"en-CH", /*locale=*/1, &model.patterns); |
| 360 | AddPattern(/*regex=*/"zh-Hant", /*locale=*/2, &model.patterns); |
| 361 | AddPattern(/*regex=*/"en-all", /*locale=*/3, &model.patterns); |
| 362 | AddPattern(/*regex=*/"zh-Hant-all", /*locale=*/4, &model.patterns); |
| 363 | AddPattern(/*regex=*/"all-CH", /*locale=*/5, &model.patterns); |
| 364 | AddPattern(/*regex=*/"default", /*locale=*/6, &model.patterns); |
| 365 | |
| 366 | builder_.Finish(DatetimeModel::Pack(builder_, &model)); |
| 367 | const DatetimeModel* model_fb = |
| 368 | flatbuffers::GetRoot<DatetimeModel>(builder_.GetBufferPointer()); |
| 369 | ASSERT_TRUE(model_fb); |
| 370 | |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 371 | parser_ = DatetimeParser::Instance(model_fb, unilib_, calendarlib_, |
Lukas Zilka | e7962cc | 2018-03-28 18:09:48 +0200 | [diff] [blame] | 372 | /*decompressor=*/nullptr); |
| 373 | ASSERT_TRUE(parser_); |
| 374 | } |
| 375 | |
| 376 | bool ParserLocaleTest::HasResult(const std::string& input, |
| 377 | const std::string& locales) { |
| 378 | std::vector<DatetimeParseResultSpan> results; |
| 379 | EXPECT_TRUE(parser_->Parse(input, /*reference_time_ms_utc=*/0, |
| 380 | /*reference_timezone=*/"", locales, |
| 381 | ModeFlag_ANNOTATION, false, &results)); |
| 382 | return results.size() == 1; |
| 383 | } |
| 384 | |
| 385 | TEST_F(ParserLocaleTest, English) { |
| 386 | EXPECT_TRUE(HasResult("en-US", /*locales=*/"en-US")); |
| 387 | EXPECT_FALSE(HasResult("en-CH", /*locales=*/"en-US")); |
| 388 | EXPECT_FALSE(HasResult("en-US", /*locales=*/"en-CH")); |
| 389 | EXPECT_TRUE(HasResult("en-CH", /*locales=*/"en-CH")); |
| 390 | EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH")); |
| 391 | } |
| 392 | |
| 393 | TEST_F(ParserLocaleTest, TraditionalChinese) { |
| 394 | EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant")); |
| 395 | EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-TW")); |
| 396 | EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-SG")); |
| 397 | EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh-SG")); |
| 398 | EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh")); |
| 399 | EXPECT_TRUE(HasResult("default", /*locales=*/"zh")); |
| 400 | EXPECT_TRUE(HasResult("default", /*locales=*/"zh-Hant-SG")); |
| 401 | } |
| 402 | |
| 403 | TEST_F(ParserLocaleTest, SwissEnglish) { |
| 404 | EXPECT_TRUE(HasResult("all-CH", /*locales=*/"de-CH")); |
| 405 | EXPECT_TRUE(HasResult("all-CH", /*locales=*/"en-CH")); |
| 406 | EXPECT_TRUE(HasResult("en-all", /*locales=*/"en-CH")); |
| 407 | EXPECT_FALSE(HasResult("all-CH", /*locales=*/"de-DE")); |
| 408 | EXPECT_TRUE(HasResult("default", /*locales=*/"de-CH")); |
| 409 | EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH")); |
| 410 | } |
Lukas Zilka | b23e212 | 2018-02-09 10:25:19 +0100 | [diff] [blame] | 411 | |
| 412 | } // namespace |
Tony Mak | 6c4cc67 | 2018-09-17 11:48:50 +0100 | [diff] [blame] | 413 | } // namespace libtextclassifier3 |