blob: 1a9b9da2a75ca6258329c416c9727d73034a60b2 [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "smartselect/feature-processor.h"
18
19#include "gmock/gmock.h"
20#include "gtest/gtest.h"
21
22namespace libtextclassifier {
23namespace {
24
25using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020026using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010027
28TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020029 std::vector<Token> tokens{Token("Hělló", 0, 5),
30 Token("fěěbař@google.com", 6, 23),
31 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010032
33 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
34
35 // clang-format off
36 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020037 {Token("Hělló", 0, 5),
38 Token("fěě", 6, 9),
39 Token("bař", 9, 12),
40 Token("@google.com", 12, 23),
41 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010042 // clang-format on
43}
44
45TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020046 std::vector<Token> tokens{Token("Hělló", 0, 5),
47 Token("fěěbař@google.com", 6, 23),
48 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010049
50 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
51
52 // clang-format off
53 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020054 {Token("Hělló", 0, 5),
55 Token("fěěbař", 6, 12),
56 Token("@google.com", 12, 23),
57 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010058 // clang-format on
59}
60
61TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020062 std::vector<Token> tokens{Token("Hělló", 0, 5),
63 Token("fěěbař@google.com", 6, 23),
64 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010065
66 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
67
68 // clang-format off
69 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020070 {Token("Hělló", 0, 5),
71 Token("fěě", 6, 9),
72 Token("bař@google.com", 9, 23),
73 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010074 // clang-format on
75}
76
77TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020078 std::vector<Token> tokens{Token("Hělló", 0, 5),
79 Token("fěěbař@google.com", 6, 23),
80 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010081
82 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
83
84 // clang-format off
85 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020086 {Token("Hělló", 0, 5),
87 Token("fěěbař@google.com", 6, 23),
88 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010089 // clang-format on
90}
91
92TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020093 std::vector<Token> tokens{Token("Hělló", 0, 5),
94 Token("fěěbař@google.com", 6, 23),
95 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010096
97 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
98
99 // clang-format off
100 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200101 {Token("Hě", 0, 2),
102 Token("lló", 2, 5),
103 Token("fěě", 6, 9),
104 Token("bař@google.com", 9, 23),
105 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100106 // clang-format on
107}
108
109TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100110 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
111 const CodepointSpan span = {0, 5};
112 // clang-format off
113 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
114 Token("Lině", 6, 10),
115 Token("Sěcond", 11, 17),
116 Token("Lině", 18, 22),
117 Token("Thiřd", 23, 28),
118 Token("Lině", 29, 33)};
119 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100120
121 // Keeps the first line.
Matt Sharifibe876dc2017-03-17 17:02:43 +0100122 internal::StripTokensFromOtherLines(context, span, &tokens);
123 EXPECT_THAT(tokens,
124 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100125}
126
127TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100128 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
129 const CodepointSpan span = {18, 22};
130 // clang-format off
131 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
132 Token("Lině", 6, 10),
133 Token("Sěcond", 11, 17),
134 Token("Lině", 18, 22),
135 Token("Thiřd", 23, 28),
136 Token("Lině", 29, 33)};
137 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100138
Matt Sharifibe876dc2017-03-17 17:02:43 +0100139 // Keeps the first line.
140 internal::StripTokensFromOtherLines(context, span, &tokens);
141 EXPECT_THAT(tokens, ElementsAreArray(
142 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100143}
144
145TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100146 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
147 const CodepointSpan span = {24, 33};
148 // clang-format off
149 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
150 Token("Lině", 6, 10),
151 Token("Sěcond", 11, 17),
152 Token("Lině", 18, 22),
153 Token("Thiřd", 23, 28),
154 Token("Lině", 29, 33)};
155 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100156
Matt Sharifibe876dc2017-03-17 17:02:43 +0100157 // Keeps the first line.
158 internal::StripTokensFromOtherLines(context, span, &tokens);
159 EXPECT_THAT(tokens, ElementsAreArray(
160 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100161}
162
163TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100164 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
165 const CodepointSpan span = {18, 22};
166 // clang-format off
167 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
168 Token("Lině", 6, 10),
169 Token("Sěcond", 11, 17),
170 Token("Lině", 18, 22),
171 Token("Thiřd", 23, 28),
172 Token("Lině", 29, 33)};
173 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100174
Matt Sharifibe876dc2017-03-17 17:02:43 +0100175 // Keeps the first line.
176 internal::StripTokensFromOtherLines(context, span, &tokens);
177 EXPECT_THAT(tokens, ElementsAreArray(
178 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100179}
180
181TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100182 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
183 const CodepointSpan span = {5, 23};
184 // clang-format off
185 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
186 Token("Lině", 6, 10),
187 Token("Sěcond", 18, 23),
188 Token("Lině", 19, 23),
189 Token("Thiřd", 23, 28),
190 Token("Lině", 29, 33)};
191 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100192
Matt Sharifibe876dc2017-03-17 17:02:43 +0100193 // Keeps the first line.
194 internal::StripTokensFromOtherLines(context, span, &tokens);
195 EXPECT_THAT(tokens, ElementsAreArray(
196 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
197 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
198 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100199}
200
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200201class TestingFeatureProcessor : public FeatureProcessor {
202 public:
203 using FeatureProcessor::FeatureProcessor;
204 using FeatureProcessor::SpanToLabel;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200205 using FeatureProcessor::SupportedCodepointsRatio;
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200206 using FeatureProcessor::IsCodepointInRanges;
Lukas Zilka40c18de2017-04-10 17:22:22 +0200207 using FeatureProcessor::ICUTokenize;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200208 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200209 using FeatureProcessor::supported_codepoint_ranges_;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200210};
211
212TEST(FeatureProcessorTest, SpanToLabel) {
213 FeatureProcessorOptions options;
214 options.set_context_size(1);
215 options.set_max_selection_span(1);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200216 options.set_snap_label_span_boundaries_to_containing_tokens(false);
217
218 TokenizationCodepointRange* config =
219 options.add_tokenization_codepoint_config();
220 config->set_start(32);
221 config->set_end(33);
222 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
223
224 TestingFeatureProcessor feature_processor(options);
225 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
226 ASSERT_EQ(3, tokens.size());
227 int label;
228 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
229 EXPECT_EQ(kInvalidLabel, label);
230 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
231 EXPECT_NE(kInvalidLabel, label);
232 TokenSpan token_span;
233 feature_processor.LabelToTokenSpan(label, &token_span);
234 EXPECT_EQ(0, token_span.first);
235 EXPECT_EQ(0, token_span.second);
236
237 // Reconfigure with snapping enabled.
238 options.set_snap_label_span_boundaries_to_containing_tokens(true);
239 TestingFeatureProcessor feature_processor2(options);
240 int label2;
241 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
242 EXPECT_EQ(label, label2);
243 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
244 EXPECT_EQ(label, label2);
245 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
246 EXPECT_EQ(label, label2);
247
248 // Cross a token boundary.
249 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
250 EXPECT_EQ(kInvalidLabel, label2);
251 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
252 EXPECT_EQ(kInvalidLabel, label2);
253
254 // Multiple tokens.
255 options.set_context_size(2);
256 options.set_max_selection_span(2);
257 TestingFeatureProcessor feature_processor3(options);
258 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
259 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
260 EXPECT_NE(kInvalidLabel, label2);
261 feature_processor3.LabelToTokenSpan(label2, &token_span);
262 EXPECT_EQ(1, token_span.first);
263 EXPECT_EQ(0, token_span.second);
264
265 int label3;
266 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
267 EXPECT_EQ(label2, label3);
268 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
269 EXPECT_EQ(label2, label3);
270 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
271 EXPECT_EQ(label2, label3);
272}
273
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200274TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
275 FeatureProcessorOptions options;
276 options.set_context_size(1);
277 options.set_max_selection_span(1);
278 options.set_snap_label_span_boundaries_to_containing_tokens(false);
279
280 TokenizationCodepointRange* config =
281 options.add_tokenization_codepoint_config();
282 config->set_start(32);
283 config->set_end(33);
284 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
285
286 TestingFeatureProcessor feature_processor(options);
287 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
288 ASSERT_EQ(3, tokens.size());
289 int label;
290 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
291 EXPECT_EQ(kInvalidLabel, label);
292 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
293 EXPECT_NE(kInvalidLabel, label);
294 TokenSpan token_span;
295 feature_processor.LabelToTokenSpan(label, &token_span);
296 EXPECT_EQ(0, token_span.first);
297 EXPECT_EQ(0, token_span.second);
298
299 // Reconfigure with snapping enabled.
300 options.set_snap_label_span_boundaries_to_containing_tokens(true);
301 TestingFeatureProcessor feature_processor2(options);
302 int label2;
303 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
304 EXPECT_EQ(label, label2);
305 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
306 EXPECT_EQ(label, label2);
307 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
308 EXPECT_EQ(label, label2);
309
310 // Cross a token boundary.
311 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
312 EXPECT_EQ(kInvalidLabel, label2);
313 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
314 EXPECT_EQ(kInvalidLabel, label2);
315
316 // Multiple tokens.
317 options.set_context_size(2);
318 options.set_max_selection_span(2);
319 TestingFeatureProcessor feature_processor3(options);
320 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
321 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
322 EXPECT_NE(kInvalidLabel, label2);
323 feature_processor3.LabelToTokenSpan(label2, &token_span);
324 EXPECT_EQ(1, token_span.first);
325 EXPECT_EQ(0, token_span.second);
326
327 int label3;
328 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
329 EXPECT_EQ(label2, label3);
330 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
331 EXPECT_EQ(label2, label3);
332 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
333 EXPECT_EQ(label2, label3);
334}
335
Matt Sharifibe876dc2017-03-17 17:02:43 +0100336TEST(FeatureProcessorTest, CenterTokenFromClick) {
337 int token_index;
338
339 // Exactly aligned indices.
340 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200341 {6, 11},
342 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100343 EXPECT_EQ(token_index, 1);
344
345 // Click is contained in a token.
346 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200347 {13, 17},
348 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100349 EXPECT_EQ(token_index, 2);
350
351 // Click spans two tokens.
352 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200353 {6, 17},
354 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100355 EXPECT_EQ(token_index, kInvalidIndex);
356}
357
358TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100359 int token_index;
360
361 // Selection of length 3. Exactly aligned indices.
362 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200363 {7, 27},
364 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
365 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100366 EXPECT_EQ(token_index, 2);
367
368 // Selection of length 1 token. Exactly aligned indices.
369 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200370 {21, 27},
371 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
372 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100373 EXPECT_EQ(token_index, 3);
374
375 // Selection marks sub-token range, with no tokens in it.
376 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200377 {29, 33},
378 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
379 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100380 EXPECT_EQ(token_index, kInvalidIndex);
381
382 // Selection of length 2. Sub-token indices.
383 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200384 {3, 25},
385 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
386 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100387 EXPECT_EQ(token_index, 1);
388
389 // Selection of length 1. Sub-token indices.
390 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200391 {22, 34},
392 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
393 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100394 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400395
396 // Some invalid ones.
397 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
398 EXPECT_EQ(token_index, -1);
399}
400
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200401TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
402 FeatureProcessorOptions options;
403 options.set_context_size(2);
404 options.set_max_selection_span(2);
405 options.set_snap_label_span_boundaries_to_containing_tokens(false);
406
407 TokenizationCodepointRange* config =
408 options.add_tokenization_codepoint_config();
409 config->set_start(32);
410 config->set_end(33);
411 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
412
413 FeatureProcessorOptions::CodepointRange* range;
414 range = options.add_supported_codepoint_ranges();
415 range->set_start(0);
416 range->set_end(128);
417
418 range = options.add_supported_codepoint_ranges();
419 range->set_start(10000);
420 range->set_end(10001);
421
422 range = options.add_supported_codepoint_ranges();
423 range->set_start(20000);
424 range->set_end(30000);
425
426 TestingFeatureProcessor feature_processor(options);
427 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
428 1, feature_processor.Tokenize("aaa bbb ccc")),
429 FloatEq(1.0));
430 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
431 1, feature_processor.Tokenize("aaa bbb ěěě")),
432 FloatEq(2.0 / 3));
433 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
434 1, feature_processor.Tokenize("ěěě řřř ěěě")),
435 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200436 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
437 -1, feature_processor.supported_codepoint_ranges_));
438 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
439 0, feature_processor.supported_codepoint_ranges_));
440 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
441 10, feature_processor.supported_codepoint_ranges_));
442 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
443 127, feature_processor.supported_codepoint_ranges_));
444 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
445 128, feature_processor.supported_codepoint_ranges_));
446 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
447 9999, feature_processor.supported_codepoint_ranges_));
448 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
449 10000, feature_processor.supported_codepoint_ranges_));
450 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
451 10001, feature_processor.supported_codepoint_ranges_));
452 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
453 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200454
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200455 std::vector<Token> tokens;
456 int click_pos;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200457 std::vector<float> extra_features;
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200458 std::unique_ptr<CachedFeatures> cached_features;
459
460 auto feature_fn = [](const std::vector<int>& sparse_features,
461 const std::vector<float>& dense_features,
462 float* embedding) { return true; };
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200463
464 options.set_min_supported_codepoint_ratio(0.0);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200465 TestingFeatureProcessor feature_processor2(options);
466 EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
467 feature_fn, 2, &tokens,
468 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200469
470 options.set_min_supported_codepoint_ratio(0.2);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200471 TestingFeatureProcessor feature_processor3(options);
472 EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
473 feature_fn, 2, &tokens,
474 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200475
476 options.set_min_supported_codepoint_ratio(0.5);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200477 TestingFeatureProcessor feature_processor4(options);
478 EXPECT_FALSE(feature_processor4.ExtractFeatures(
479 "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
480 &cached_features));
481}
482
483TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
484 std::vector<Token> tokens_orig{
485 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
486 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
487 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
488 Token("12", 0, 0)};
489
490 std::vector<Token> tokens;
491 int click_index;
492
493 // Try to click first token and see if it gets padded from left.
494 tokens = tokens_orig;
495 click_index = 0;
496 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
497 // clang-format off
498 EXPECT_EQ(tokens, std::vector<Token>({Token(),
499 Token(),
500 Token("0", 0, 0),
501 Token("1", 0, 0),
502 Token("2", 0, 0)}));
503 // clang-format on
504 EXPECT_EQ(click_index, 2);
505
506 // When we click the second token nothing should get padded.
507 tokens = tokens_orig;
508 click_index = 2;
509 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
510 // clang-format off
511 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
512 Token("1", 0, 0),
513 Token("2", 0, 0),
514 Token("3", 0, 0),
515 Token("4", 0, 0)}));
516 // clang-format on
517 EXPECT_EQ(click_index, 2);
518
519 // When we click the last token tokens should get padded from the right.
520 tokens = tokens_orig;
521 click_index = 12;
522 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
523 // clang-format off
524 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
525 Token("11", 0, 0),
526 Token("12", 0, 0),
527 Token(),
528 Token()}));
529 // clang-format on
530 EXPECT_EQ(click_index, 2);
531}
532
533TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
534 std::vector<Token> tokens_orig{
535 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
536 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
537 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
538 Token("12", 0, 0)};
539
540 std::vector<Token> tokens;
541 int click_index;
542
543 // Try to click first token and see if it gets padded from left to maximum
544 // context_size.
545 tokens = tokens_orig;
546 click_index = 0;
547 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
548 // clang-format off
549 EXPECT_EQ(tokens, std::vector<Token>({Token(),
550 Token(),
551 Token("0", 0, 0),
552 Token("1", 0, 0),
553 Token("2", 0, 0),
554 Token("3", 0, 0),
555 Token("4", 0, 0),
556 Token("5", 0, 0)}));
557 // clang-format on
558 EXPECT_EQ(click_index, 2);
559
560 // Clicking to the middle with enough context should not produce any padding.
561 tokens = tokens_orig;
562 click_index = 6;
563 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
564 // clang-format off
565 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
566 Token("2", 0, 0),
567 Token("3", 0, 0),
568 Token("4", 0, 0),
569 Token("5", 0, 0),
570 Token("6", 0, 0),
571 Token("7", 0, 0),
572 Token("8", 0, 0),
573 Token("9", 0, 0)}));
574 // clang-format on
575 EXPECT_EQ(click_index, 5);
576
577 // Clicking at the end should pad right to maximum context_size.
578 tokens = tokens_orig;
579 click_index = 11;
580 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
581 // clang-format off
582 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
583 Token("7", 0, 0),
584 Token("8", 0, 0),
585 Token("9", 0, 0),
586 Token("10", 0, 0),
587 Token("11", 0, 0),
588 Token("12", 0, 0),
589 Token(),
590 Token()}));
591 // clang-format on
592 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200593}
594
Lukas Zilka40c18de2017-04-10 17:22:22 +0200595TEST(FeatureProcessorTest, ICUTokenize) {
596 FeatureProcessorOptions options;
597 options.set_tokenization_type(
598 libtextclassifier::FeatureProcessorOptions::ICU);
599
600 TestingFeatureProcessor feature_processor(options);
601 std::vector<Token> tokens = feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
602 ASSERT_EQ(tokens,
603 // clang-format off
604 std::vector<Token>({Token("พระบาท", 0, 6),
605 Token("สมเด็จ", 6, 12),
606 Token("พระ", 12, 15),
607 Token("ปร", 15, 17),
608 Token("มิ", 17, 19)}));
609 // clang-format on
610}
611
612TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
613 FeatureProcessorOptions options;
614 options.set_tokenization_type(
615 libtextclassifier::FeatureProcessorOptions::ICU);
616 options.set_icu_preserve_whitespace_tokens(true);
617
618 TestingFeatureProcessor feature_processor(options);
619 std::vector<Token> tokens =
620 feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
621 ASSERT_EQ(tokens,
622 // clang-format off
623 std::vector<Token>({Token("พระบาท", 0, 6),
624 Token(" ", 6, 7),
625 Token("สมเด็จ", 7, 13),
626 Token(" ", 13, 14),
627 Token("พระ", 14, 17),
628 Token(" ", 17, 18),
629 Token("ปร", 18, 20),
630 Token(" ", 20, 21),
631 Token("มิ", 21, 23)}));
632 // clang-format on
633}
634
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200635TEST(FeatureProcessorTest, MixedTokenize) {
636 FeatureProcessorOptions options;
637 options.set_tokenization_type(
638 libtextclassifier::FeatureProcessorOptions::MIXED);
639
640 TokenizationCodepointRange* config =
641 options.add_tokenization_codepoint_config();
642 config->set_start(32);
643 config->set_end(33);
644 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
645
646 FeatureProcessorOptions::CodepointRange* range;
647 range = options.add_internal_tokenizer_codepoint_ranges();
648 range->set_start(0);
649 range->set_end(128);
650
651 range = options.add_internal_tokenizer_codepoint_ranges();
652 range->set_start(128);
653 range->set_end(256);
654
655 range = options.add_internal_tokenizer_codepoint_ranges();
656 range->set_start(256);
657 range->set_end(384);
658
659 range = options.add_internal_tokenizer_codepoint_ranges();
660 range->set_start(384);
661 range->set_end(592);
662
663 TestingFeatureProcessor feature_processor(options);
664 std::vector<Token> tokens = feature_processor.Tokenize(
665 "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
666 ASSERT_EQ(tokens,
667 // clang-format off
668 std::vector<Token>({Token("こんにちは", 0, 5),
669 Token("Japanese-ląnguagę", 5, 22),
670 Token("text", 23, 27),
671 Token("世界", 28, 30),
672 Token("http://www.google.com/", 31, 53)}));
673 // clang-format on
674}
675
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200676TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
677 FeatureProcessorOptions options;
678 options.add_ignored_span_boundary_codepoints('.');
679 options.add_ignored_span_boundary_codepoints(',');
680 options.add_ignored_span_boundary_codepoints('[');
681 options.add_ignored_span_boundary_codepoints(']');
682
683 TestingFeatureProcessor feature_processor(options);
684
685 const std::string text1_utf8 = "ěščř";
686 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
687 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
688 text1.begin(), text1.end(),
689 /*count_from_beginning=*/true),
690 0);
691 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
692 text1.begin(), text1.end(),
693 /*count_from_beginning=*/false),
694 0);
695
696 const std::string text2_utf8 = ".,abčd";
697 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
698 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
699 text2.begin(), text2.end(),
700 /*count_from_beginning=*/true),
701 2);
702 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
703 text2.begin(), text2.end(),
704 /*count_from_beginning=*/false),
705 0);
706
707 const std::string text3_utf8 = ".,abčd[]";
708 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
709 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
710 text3.begin(), text3.end(),
711 /*count_from_beginning=*/true),
712 2);
713 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
714 text3.begin(), text3.end(),
715 /*count_from_beginning=*/false),
716 2);
717
718 const std::string text4_utf8 = "[abčd]";
719 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
720 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
721 text4.begin(), text4.end(),
722 /*count_from_beginning=*/true),
723 1);
724 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
725 text4.begin(), text4.end(),
726 /*count_from_beginning=*/false),
727 1);
728
729 const std::string text5_utf8 = "";
730 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
731 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
732 text5.begin(), text5.end(),
733 /*count_from_beginning=*/true),
734 0);
735 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
736 text5.begin(), text5.end(),
737 /*count_from_beginning=*/false),
738 0);
739
740 const std::string text6_utf8 = "012345ěščř";
741 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
742 UnicodeText::const_iterator text6_begin = text6.begin();
743 std::advance(text6_begin, 6);
744 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
745 text6_begin, text6.end(),
746 /*count_from_beginning=*/true),
747 0);
748 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
749 text6_begin, text6.end(),
750 /*count_from_beginning=*/false),
751 0);
752
753 const std::string text7_utf8 = "012345.,ěščř";
754 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
755 UnicodeText::const_iterator text7_begin = text7.begin();
756 std::advance(text7_begin, 6);
757 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
758 text7_begin, text7.end(),
759 /*count_from_beginning=*/true),
760 2);
761 UnicodeText::const_iterator text7_end = text7.begin();
762 std::advance(text7_end, 8);
763 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
764 text7.begin(), text7_end,
765 /*count_from_beginning=*/false),
766 2);
767
768 // Test not stripping.
769 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
770 "Hello [[[Wořld]] or not?", {0, 24}),
771 std::make_pair(0, 24));
772 // Test basic stripping.
773 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
774 "Hello [[[Wořld]] or not?", {6, 16}),
775 std::make_pair(9, 14));
776 // Test stripping when everything is stripped.
777 EXPECT_EQ(
778 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
779 std::make_pair(6, 6));
780 // Test stripping empty string.
781 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
782 std::make_pair(0, 0));
783}
784
Matt Sharifid40f9762017-03-14 21:24:23 +0100785} // namespace
786} // namespace libtextclassifier