blob: 36525e2ec6b0a719e6364e19e85789ab57d4adfe [file] [log] [blame]
Lukas Zilkab23e2122018-02-09 10:25:19 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <time.h>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <utility>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

#include "datetime/parser.h"
#include "model_generated.h"
#include "text-classifier.h"
#include "types-test-util.h"
30
31using testing::ElementsAreArray;
32
33namespace libtextclassifier2 {
34namespace {
35
36std::string GetModelPath() {
37 return LIBTEXTCLASSIFIER_TEST_DATA_DIR;
38}
39
// Reads the whole content of `file_name` and returns it as a string.
//
// The stream is opened in binary mode because the callers load serialized
// model data; text mode may translate line endings or stop at an EOF marker on
// some platforms. Returns an empty string if the file cannot be opened.
std::string ReadFile(const std::string& file_name) {
  std::ifstream file_stream(file_name, std::ios::in | std::ios::binary);
  return std::string(std::istreambuf_iterator<char>(file_stream),
                     std::istreambuf_iterator<char>());
}
44
45std::string FormatMillis(int64 time_ms_utc) {
46 long time_seconds = time_ms_utc / 1000; // NOLINT
47 // Format time, "ddd yyyy-mm-dd hh:mm:ss zzz"
48 char buffer[512];
49 strftime(buffer, sizeof(buffer), "%a %Y-%m-%d %H:%M:%S %Z",
50 localtime(&time_seconds));
51 return std::string(buffer);
52}
53
54class ParserTest : public testing::Test {
55 public:
56 void SetUp() override {
57 model_buffer_ = ReadFile(GetModelPath() + "test_model.fb");
Lukas Zilkae7962cc2018-03-28 18:09:48 +020058 classifier_ = TextClassifier::FromUnownedBuffer(
59 model_buffer_.data(), model_buffer_.size(), &unilib_);
60 TC_CHECK(classifier_);
61 parser_ = classifier_->DatetimeParserForTests();
62 }
63
64 bool HasNoResult(const std::string& text, bool anchor_start_end = false,
65 const std::string& timezone = "Europe/Zurich") {
66 std::vector<DatetimeParseResultSpan> results;
67 if (!parser_->Parse(text, 0, timezone, /*locales=*/"", ModeFlag_ANNOTATION,
68 anchor_start_end, &results)) {
69 TC_LOG(ERROR) << text;
70 TC_CHECK(false);
71 }
72 return results.empty();
Lukas Zilkab23e2122018-02-09 10:25:19 +010073 }
74
75 bool ParsesCorrectly(const std::string& marked_text,
76 const int64 expected_ms_utc,
77 DatetimeGranularity expected_granularity,
Lukas Zilkae7962cc2018-03-28 18:09:48 +020078 bool anchor_start_end = false,
Lukas Zilkab23e2122018-02-09 10:25:19 +010079 const std::string& timezone = "Europe/Zurich") {
80 auto expected_start_index = marked_text.find("{");
81 EXPECT_TRUE(expected_start_index != std::string::npos);
82 auto expected_end_index = marked_text.find("}");
83 EXPECT_TRUE(expected_end_index != std::string::npos);
84
85 std::string text;
86 text += std::string(marked_text.begin(),
87 marked_text.begin() + expected_start_index);
88 text += std::string(marked_text.begin() + expected_start_index + 1,
89 marked_text.begin() + expected_end_index);
90 text += std::string(marked_text.begin() + expected_end_index + 1,
91 marked_text.end());
92
93 std::vector<DatetimeParseResultSpan> results;
94
Lukas Zilkaba849e72018-03-08 14:48:21 +010095 if (!parser_->Parse(text, 0, timezone, /*locales=*/"", ModeFlag_ANNOTATION,
Lukas Zilkae7962cc2018-03-28 18:09:48 +020096 anchor_start_end, &results)) {
Lukas Zilkab23e2122018-02-09 10:25:19 +010097 TC_LOG(ERROR) << text;
98 TC_CHECK(false);
99 }
100 EXPECT_TRUE(!results.empty());
101
102 std::vector<DatetimeParseResultSpan> filtered_results;
103 for (const DatetimeParseResultSpan& result : results) {
104 if (SpansOverlap(result.span,
105 {expected_start_index, expected_end_index})) {
106 filtered_results.push_back(result);
107 }
108 }
109
110 const std::vector<DatetimeParseResultSpan> expected{
111 {{expected_start_index, expected_end_index - 1},
112 {expected_ms_utc, expected_granularity},
113 /*target_classification_score=*/1.0,
114 /*priority_score=*/0.0}};
115 const bool matches =
116 testing::Matches(ElementsAreArray(expected))(filtered_results);
117 if (!matches) {
118 TC_LOG(ERROR) << "Expected: " << expected[0] << " which corresponds to: "
119 << FormatMillis(expected[0].data.time_ms_utc);
120 for (int i = 0; i < filtered_results.size(); ++i) {
121 TC_LOG(ERROR) << "Actual[" << i << "]: " << filtered_results[i]
122 << " which corresponds to: "
123 << FormatMillis(filtered_results[i].data.time_ms_utc);
124 }
125 }
126 return matches;
127 }
128
129 protected:
130 std::string model_buffer_;
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200131 std::unique_ptr<TextClassifier> classifier_;
132 const DatetimeParser* parser_;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100133 UniLib unilib_;
134};
135
136// Test with just a few cases to make debugging of general failures easier.
137TEST_F(ParserTest, ParseShort) {
138 EXPECT_TRUE(
139 ParsesCorrectly("{January 1, 1988}", 567990000000, GRANULARITY_DAY));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100140 EXPECT_TRUE(ParsesCorrectly("{three days ago}", -262800000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100141}
142
143TEST_F(ParserTest, Parse) {
144 EXPECT_TRUE(
145 ParsesCorrectly("{January 1, 1988}", 567990000000, GRANULARITY_DAY));
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200146 EXPECT_TRUE(ParsesCorrectly("{1 2 2018}", 1514847600000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100147 EXPECT_TRUE(
148 ParsesCorrectly("{january 31 2018}", 1517353200000, GRANULARITY_DAY));
149 EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000,
150 GRANULARITY_DAY));
151 EXPECT_TRUE(ParsesCorrectly("{19/apr/2010:06:36:15}", 1271651775000,
152 GRANULARITY_SECOND));
153 EXPECT_TRUE(ParsesCorrectly("{09/Mar/2004 22:02:40}", 1078866160000,
154 GRANULARITY_SECOND));
155 EXPECT_TRUE(ParsesCorrectly("{Dec 2, 2010 2:39:58 AM}", 1291253998000,
156 GRANULARITY_SECOND));
157 EXPECT_TRUE(ParsesCorrectly("{Jun 09 2011 15:28:14}", 1307626094000,
158 GRANULARITY_SECOND));
159 EXPECT_TRUE(ParsesCorrectly("{Apr 20 00:00:35 2010}", 1271714435000,
160 GRANULARITY_SECOND));
161 EXPECT_TRUE(
162 ParsesCorrectly("{Mar 16 08:12:04}", 6419524000, GRANULARITY_SECOND));
163 EXPECT_TRUE(ParsesCorrectly("{2012-10-14T22:11:20}", 1350245480000,
164 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100165 EXPECT_TRUE(ParsesCorrectly("{2014-07-01T14:59:55}.711Z", 1404219595000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100166 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100167 EXPECT_TRUE(ParsesCorrectly("{2010-06-26 02:31:29},573", 1277512289000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100168 GRANULARITY_SECOND));
169 EXPECT_TRUE(ParsesCorrectly("{2006/01/22 04:11:05}", 1137899465000,
170 GRANULARITY_SECOND));
171 EXPECT_TRUE(
172 ParsesCorrectly("{150423 11:42:35}", 1429782155000, GRANULARITY_SECOND));
173 EXPECT_TRUE(ParsesCorrectly("{11:42:35}", 38555000, GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100174 EXPECT_TRUE(ParsesCorrectly("{11:42:35}.173", 38555000, GRANULARITY_SECOND));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100175 EXPECT_TRUE(
Lukas Zilkaba849e72018-03-08 14:48:21 +0100176 ParsesCorrectly("{23/Apr 11:42:35},173", 9715355000, GRANULARITY_SECOND));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100177 EXPECT_TRUE(ParsesCorrectly("{23/Apr/2015:11:42:35}", 1429782155000,
178 GRANULARITY_SECOND));
179 EXPECT_TRUE(ParsesCorrectly("{23/Apr/2015 11:42:35}", 1429782155000,
180 GRANULARITY_SECOND));
181 EXPECT_TRUE(ParsesCorrectly("{23-Apr-2015 11:42:35}", 1429782155000,
182 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100183 EXPECT_TRUE(ParsesCorrectly("{23-Apr-2015 11:42:35}.883", 1429782155000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100184 GRANULARITY_SECOND));
185 EXPECT_TRUE(ParsesCorrectly("{23 Apr 2015 11:42:35}", 1429782155000,
186 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100187 EXPECT_TRUE(ParsesCorrectly("{23 Apr 2015 11:42:35}.883", 1429782155000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100188 GRANULARITY_SECOND));
189 EXPECT_TRUE(ParsesCorrectly("{04/23/15 11:42:35}", 1429782155000,
190 GRANULARITY_SECOND));
191 EXPECT_TRUE(ParsesCorrectly("{04/23/2015 11:42:35}", 1429782155000,
192 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100193 EXPECT_TRUE(ParsesCorrectly("{04/23/2015 11:42:35}.883", 1429782155000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100194 GRANULARITY_SECOND));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100195 EXPECT_TRUE(ParsesCorrectly("{8/5/2011 3:31:18 AM}:234}", 1312507878000,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100196 GRANULARITY_SECOND));
197 EXPECT_TRUE(ParsesCorrectly("{9/28/2011 2:23:15 PM}", 1317212595000,
198 GRANULARITY_SECOND));
199 EXPECT_TRUE(ParsesCorrectly("{19/apr/2010:06:36:15}", 1271651775000,
200 GRANULARITY_SECOND));
201 EXPECT_TRUE(ParsesCorrectly(
202 "Are sentiments apartments decisively the especially alteration. "
203 "Thrown shy denote ten ladies though ask saw. Or by to he going "
204 "think order event music. Incommode so intention defective at "
205 "convinced. Led income months itself and houses you. After nor "
206 "you leave might share court balls. {19/apr/2010:06:36:15} Are "
207 "sentiments apartments decisively the especially alteration. "
208 "Thrown shy denote ten ladies though ask saw. Or by to he going "
209 "think order event music. Incommode so intention defective at "
210 "convinced. Led income months itself and houses you. After nor "
211 "you leave might share court balls. ",
212 1271651775000, GRANULARITY_SECOND));
213 EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4:30}", 1514777400000,
214 GRANULARITY_MINUTE));
215 EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4}", 1514775600000,
216 GRANULARITY_HOUR));
217 EXPECT_TRUE(ParsesCorrectly("{january 1 2018 at 4pm}", 1514818800000,
218 GRANULARITY_HOUR));
219
Lukas Zilkaba849e72018-03-08 14:48:21 +0100220 EXPECT_TRUE(ParsesCorrectly("{today}", -3600000, GRANULARITY_DAY));
221 EXPECT_TRUE(ParsesCorrectly("{today}", -57600000, GRANULARITY_DAY,
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200222 /*anchor_start_end=*/false,
Lukas Zilkab23e2122018-02-09 10:25:19 +0100223 "America/Los_Angeles"));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100224 EXPECT_TRUE(ParsesCorrectly("{next week}", 255600000, GRANULARITY_WEEK));
225 EXPECT_TRUE(ParsesCorrectly("{next day}", 82800000, GRANULARITY_DAY));
226 EXPECT_TRUE(ParsesCorrectly("{in three days}", 255600000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100227 EXPECT_TRUE(
Lukas Zilkaba849e72018-03-08 14:48:21 +0100228 ParsesCorrectly("{in three weeks}", 1465200000, GRANULARITY_WEEK));
229 EXPECT_TRUE(ParsesCorrectly("{tomorrow}", 82800000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100230 EXPECT_TRUE(
231 ParsesCorrectly("{tomorrow at 4:00}", 97200000, GRANULARITY_MINUTE));
232 EXPECT_TRUE(ParsesCorrectly("{tomorrow at 4}", 97200000, GRANULARITY_HOUR));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100233 EXPECT_TRUE(ParsesCorrectly("{next wednesday}", 514800000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100234 EXPECT_TRUE(
235 ParsesCorrectly("{next wednesday at 4}", 529200000, GRANULARITY_HOUR));
236 EXPECT_TRUE(ParsesCorrectly("last seen {today at 9:01 PM}", 72060000,
237 GRANULARITY_MINUTE));
Lukas Zilkaba849e72018-03-08 14:48:21 +0100238 EXPECT_TRUE(ParsesCorrectly("{Three days ago}", -262800000, GRANULARITY_DAY));
239 EXPECT_TRUE(ParsesCorrectly("{three days ago}", -262800000, GRANULARITY_DAY));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100240}
241
Lukas Zilkae7962cc2018-03-28 18:09:48 +0200242TEST_F(ParserTest, ParseWithAnchor) {
243 EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000,
244 GRANULARITY_DAY, /*anchor_start_end=*/false));
245 EXPECT_TRUE(ParsesCorrectly("{January 1, 1988}", 567990000000,
246 GRANULARITY_DAY, /*anchor_start_end=*/true));
247 EXPECT_TRUE(ParsesCorrectly("lorem {1 january 2018} ipsum", 1514761200000,
248 GRANULARITY_DAY, /*anchor_start_end=*/false));
249 EXPECT_TRUE(HasNoResult("lorem 1 january 2018 ipsum",
250 /*anchor_start_end=*/true));
251}
252
253class ParserLocaleTest : public testing::Test {
254 public:
255 void SetUp() override;
256 bool HasResult(const std::string& input, const std::string& locales);
257
258 protected:
259 UniLib unilib_;
260 flatbuffers::FlatBufferBuilder builder_;
261 std::unique_ptr<DatetimeParser> parser_;
262};
263
264void AddPattern(const std::string& regex, int locale,
265 std::vector<std::unique_ptr<DatetimeModelPatternT>>* patterns) {
266 patterns->emplace_back(new DatetimeModelPatternT);
267 patterns->back()->regexes.emplace_back(new DatetimeModelPattern_::RegexT);
268 patterns->back()->regexes.back()->pattern = regex;
269 patterns->back()->regexes.back()->groups.push_back(
270 DatetimeGroupType_GROUP_UNUSED);
271 patterns->back()->locales.push_back(locale);
272}
273
274void ParserLocaleTest::SetUp() {
275 DatetimeModelT model;
276 model.use_extractors_for_locating = false;
277 model.locales.clear();
278 model.locales.push_back("en-US");
279 model.locales.push_back("en-CH");
280 model.locales.push_back("zh-Hant");
281 model.locales.push_back("en-*");
282 model.locales.push_back("zh-Hant-*");
283 model.locales.push_back("*-CH");
284 model.locales.push_back("");
285
286 AddPattern(/*regex=*/"en-US", /*locale=*/0, &model.patterns);
287 AddPattern(/*regex=*/"en-CH", /*locale=*/1, &model.patterns);
288 AddPattern(/*regex=*/"zh-Hant", /*locale=*/2, &model.patterns);
289 AddPattern(/*regex=*/"en-all", /*locale=*/3, &model.patterns);
290 AddPattern(/*regex=*/"zh-Hant-all", /*locale=*/4, &model.patterns);
291 AddPattern(/*regex=*/"all-CH", /*locale=*/5, &model.patterns);
292 AddPattern(/*regex=*/"default", /*locale=*/6, &model.patterns);
293
294 builder_.Finish(DatetimeModel::Pack(builder_, &model));
295 const DatetimeModel* model_fb =
296 flatbuffers::GetRoot<DatetimeModel>(builder_.GetBufferPointer());
297 ASSERT_TRUE(model_fb);
298
299 parser_ = DatetimeParser::Instance(model_fb, unilib_,
300 /*decompressor=*/nullptr);
301 ASSERT_TRUE(parser_);
302}
303
304bool ParserLocaleTest::HasResult(const std::string& input,
305 const std::string& locales) {
306 std::vector<DatetimeParseResultSpan> results;
307 EXPECT_TRUE(parser_->Parse(input, /*reference_time_ms_utc=*/0,
308 /*reference_timezone=*/"", locales,
309 ModeFlag_ANNOTATION, false, &results));
310 return results.size() == 1;
311}
312
313TEST_F(ParserLocaleTest, English) {
314 EXPECT_TRUE(HasResult("en-US", /*locales=*/"en-US"));
315 EXPECT_FALSE(HasResult("en-CH", /*locales=*/"en-US"));
316 EXPECT_FALSE(HasResult("en-US", /*locales=*/"en-CH"));
317 EXPECT_TRUE(HasResult("en-CH", /*locales=*/"en-CH"));
318 EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH"));
319}
320
321TEST_F(ParserLocaleTest, TraditionalChinese) {
322 EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant"));
323 EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-TW"));
324 EXPECT_TRUE(HasResult("zh-Hant-all", /*locales=*/"zh-Hant-SG"));
325 EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh-SG"));
326 EXPECT_FALSE(HasResult("zh-Hant-all", /*locales=*/"zh"));
327 EXPECT_TRUE(HasResult("default", /*locales=*/"zh"));
328 EXPECT_TRUE(HasResult("default", /*locales=*/"zh-Hant-SG"));
329}
330
331TEST_F(ParserLocaleTest, SwissEnglish) {
332 EXPECT_TRUE(HasResult("all-CH", /*locales=*/"de-CH"));
333 EXPECT_TRUE(HasResult("all-CH", /*locales=*/"en-CH"));
334 EXPECT_TRUE(HasResult("en-all", /*locales=*/"en-CH"));
335 EXPECT_FALSE(HasResult("all-CH", /*locales=*/"de-DE"));
336 EXPECT_TRUE(HasResult("default", /*locales=*/"de-CH"));
337 EXPECT_TRUE(HasResult("default", /*locales=*/"en-CH"));
338}
Lukas Zilkab23e2122018-02-09 10:25:19 +0100339
340} // namespace
341} // namespace libtextclassifier2