/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "smartselect/feature-processor.h"
18
19#include "gmock/gmock.h"
20#include "gtest/gtest.h"
21
22namespace libtextclassifier {
23namespace {
24
25using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020026using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010027
Lukas Zilka726b4d22017-12-13 16:37:03 +010028class TestingFeatureProcessor : public FeatureProcessor {
29 public:
30 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
31 using FeatureProcessor::FeatureProcessor;
32 using FeatureProcessor::ICUTokenize;
33 using FeatureProcessor::IsCodepointInRanges;
34 using FeatureProcessor::SpanToLabel;
35 using FeatureProcessor::StripTokensFromOtherLines;
36 using FeatureProcessor::supported_codepoint_ranges_;
37 using FeatureProcessor::SupportedCodepointsRatio;
38};
39
Matt Sharifid40f9762017-03-14 21:24:23 +010040TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020041 std::vector<Token> tokens{Token("Hělló", 0, 5),
42 Token("fěěbař@google.com", 6, 23),
43 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010044
45 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
46
47 // clang-format off
48 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020049 {Token("Hělló", 0, 5),
50 Token("fěě", 6, 9),
51 Token("bař", 9, 12),
52 Token("@google.com", 12, 23),
53 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010054 // clang-format on
55}
56
57TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020058 std::vector<Token> tokens{Token("Hělló", 0, 5),
59 Token("fěěbař@google.com", 6, 23),
60 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010061
62 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
63
64 // clang-format off
65 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020066 {Token("Hělló", 0, 5),
67 Token("fěěbař", 6, 12),
68 Token("@google.com", 12, 23),
69 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010070 // clang-format on
71}
72
73TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020074 std::vector<Token> tokens{Token("Hělló", 0, 5),
75 Token("fěěbař@google.com", 6, 23),
76 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010077
78 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
79
80 // clang-format off
81 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020082 {Token("Hělló", 0, 5),
83 Token("fěě", 6, 9),
84 Token("bař@google.com", 9, 23),
85 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010086 // clang-format on
87}
88
89TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020090 std::vector<Token> tokens{Token("Hělló", 0, 5),
91 Token("fěěbař@google.com", 6, 23),
92 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010093
94 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
95
96 // clang-format off
97 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020098 {Token("Hělló", 0, 5),
99 Token("fěěbař@google.com", 6, 23),
100 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100101 // clang-format on
102}
103
104TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200105 std::vector<Token> tokens{Token("Hělló", 0, 5),
106 Token("fěěbař@google.com", 6, 23),
107 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100108
109 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
110
111 // clang-format off
112 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200113 {Token("Hě", 0, 2),
114 Token("lló", 2, 5),
115 Token("fěě", 6, 9),
116 Token("bař@google.com", 9, 23),
117 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100118 // clang-format on
119}
120
121TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Lukas Zilka726b4d22017-12-13 16:37:03 +0100122 FeatureProcessorOptions options;
123 options.set_only_use_line_with_click(true);
124 TestingFeatureProcessor feature_processor(options);
125
Matt Sharifibe876dc2017-03-17 17:02:43 +0100126 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
127 const CodepointSpan span = {0, 5};
128 // clang-format off
129 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
130 Token("Lině", 6, 10),
131 Token("Sěcond", 11, 17),
132 Token("Lině", 18, 22),
133 Token("Thiřd", 23, 28),
134 Token("Lině", 29, 33)};
135 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100136
137 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100138 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100139 EXPECT_THAT(tokens,
140 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100141}
142
143TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Lukas Zilka726b4d22017-12-13 16:37:03 +0100144 FeatureProcessorOptions options;
145 options.set_only_use_line_with_click(true);
146 TestingFeatureProcessor feature_processor(options);
147
Matt Sharifibe876dc2017-03-17 17:02:43 +0100148 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
149 const CodepointSpan span = {18, 22};
150 // clang-format off
151 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
152 Token("Lině", 6, 10),
153 Token("Sěcond", 11, 17),
154 Token("Lině", 18, 22),
155 Token("Thiřd", 23, 28),
156 Token("Lině", 29, 33)};
157 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100158
Matt Sharifibe876dc2017-03-17 17:02:43 +0100159 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100160 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100161 EXPECT_THAT(tokens, ElementsAreArray(
162 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100163}
164
165TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Lukas Zilka726b4d22017-12-13 16:37:03 +0100166 FeatureProcessorOptions options;
167 options.set_only_use_line_with_click(true);
168 TestingFeatureProcessor feature_processor(options);
169
Matt Sharifibe876dc2017-03-17 17:02:43 +0100170 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
171 const CodepointSpan span = {24, 33};
172 // clang-format off
173 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
174 Token("Lině", 6, 10),
175 Token("Sěcond", 11, 17),
176 Token("Lině", 18, 22),
177 Token("Thiřd", 23, 28),
178 Token("Lině", 29, 33)};
179 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100180
Matt Sharifibe876dc2017-03-17 17:02:43 +0100181 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100182 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100183 EXPECT_THAT(tokens, ElementsAreArray(
184 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100185}
186
187TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Lukas Zilka726b4d22017-12-13 16:37:03 +0100188 FeatureProcessorOptions options;
189 options.set_only_use_line_with_click(true);
190 TestingFeatureProcessor feature_processor(options);
191
Matt Sharifibe876dc2017-03-17 17:02:43 +0100192 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
193 const CodepointSpan span = {18, 22};
194 // clang-format off
195 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
196 Token("Lině", 6, 10),
197 Token("Sěcond", 11, 17),
198 Token("Lině", 18, 22),
199 Token("Thiřd", 23, 28),
200 Token("Lině", 29, 33)};
201 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100202
Matt Sharifibe876dc2017-03-17 17:02:43 +0100203 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100204 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100205 EXPECT_THAT(tokens, ElementsAreArray(
206 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100207}
208
209TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Lukas Zilka726b4d22017-12-13 16:37:03 +0100210 FeatureProcessorOptions options;
211 options.set_only_use_line_with_click(true);
212 TestingFeatureProcessor feature_processor(options);
213
Matt Sharifibe876dc2017-03-17 17:02:43 +0100214 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
215 const CodepointSpan span = {5, 23};
216 // clang-format off
217 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
218 Token("Lině", 6, 10),
219 Token("Sěcond", 18, 23),
220 Token("Lině", 19, 23),
221 Token("Thiřd", 23, 28),
222 Token("Lině", 29, 33)};
223 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100224
Matt Sharifibe876dc2017-03-17 17:02:43 +0100225 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100226 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100227 EXPECT_THAT(tokens, ElementsAreArray(
228 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
229 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
230 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100231}
232
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200233TEST(FeatureProcessorTest, SpanToLabel) {
234 FeatureProcessorOptions options;
235 options.set_context_size(1);
236 options.set_max_selection_span(1);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200237 options.set_snap_label_span_boundaries_to_containing_tokens(false);
238
239 TokenizationCodepointRange* config =
240 options.add_tokenization_codepoint_config();
241 config->set_start(32);
242 config->set_end(33);
243 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
244
245 TestingFeatureProcessor feature_processor(options);
246 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
247 ASSERT_EQ(3, tokens.size());
248 int label;
249 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
250 EXPECT_EQ(kInvalidLabel, label);
251 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
252 EXPECT_NE(kInvalidLabel, label);
253 TokenSpan token_span;
254 feature_processor.LabelToTokenSpan(label, &token_span);
255 EXPECT_EQ(0, token_span.first);
256 EXPECT_EQ(0, token_span.second);
257
258 // Reconfigure with snapping enabled.
259 options.set_snap_label_span_boundaries_to_containing_tokens(true);
260 TestingFeatureProcessor feature_processor2(options);
261 int label2;
262 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
263 EXPECT_EQ(label, label2);
264 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
265 EXPECT_EQ(label, label2);
266 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
267 EXPECT_EQ(label, label2);
268
269 // Cross a token boundary.
270 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
271 EXPECT_EQ(kInvalidLabel, label2);
272 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
273 EXPECT_EQ(kInvalidLabel, label2);
274
275 // Multiple tokens.
276 options.set_context_size(2);
277 options.set_max_selection_span(2);
278 TestingFeatureProcessor feature_processor3(options);
279 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
280 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
281 EXPECT_NE(kInvalidLabel, label2);
282 feature_processor3.LabelToTokenSpan(label2, &token_span);
283 EXPECT_EQ(1, token_span.first);
284 EXPECT_EQ(0, token_span.second);
285
286 int label3;
287 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
288 EXPECT_EQ(label2, label3);
289 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
290 EXPECT_EQ(label2, label3);
291 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
292 EXPECT_EQ(label2, label3);
293}
294
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200295TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
296 FeatureProcessorOptions options;
297 options.set_context_size(1);
298 options.set_max_selection_span(1);
299 options.set_snap_label_span_boundaries_to_containing_tokens(false);
300
301 TokenizationCodepointRange* config =
302 options.add_tokenization_codepoint_config();
303 config->set_start(32);
304 config->set_end(33);
305 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
306
307 TestingFeatureProcessor feature_processor(options);
308 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
309 ASSERT_EQ(3, tokens.size());
310 int label;
311 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
312 EXPECT_EQ(kInvalidLabel, label);
313 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
314 EXPECT_NE(kInvalidLabel, label);
315 TokenSpan token_span;
316 feature_processor.LabelToTokenSpan(label, &token_span);
317 EXPECT_EQ(0, token_span.first);
318 EXPECT_EQ(0, token_span.second);
319
320 // Reconfigure with snapping enabled.
321 options.set_snap_label_span_boundaries_to_containing_tokens(true);
322 TestingFeatureProcessor feature_processor2(options);
323 int label2;
324 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
325 EXPECT_EQ(label, label2);
326 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
327 EXPECT_EQ(label, label2);
328 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
329 EXPECT_EQ(label, label2);
330
331 // Cross a token boundary.
332 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
333 EXPECT_EQ(kInvalidLabel, label2);
334 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
335 EXPECT_EQ(kInvalidLabel, label2);
336
337 // Multiple tokens.
338 options.set_context_size(2);
339 options.set_max_selection_span(2);
340 TestingFeatureProcessor feature_processor3(options);
341 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
342 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
343 EXPECT_NE(kInvalidLabel, label2);
344 feature_processor3.LabelToTokenSpan(label2, &token_span);
345 EXPECT_EQ(1, token_span.first);
346 EXPECT_EQ(0, token_span.second);
347
348 int label3;
349 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
350 EXPECT_EQ(label2, label3);
351 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
352 EXPECT_EQ(label2, label3);
353 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
354 EXPECT_EQ(label2, label3);
355}
356
Matt Sharifibe876dc2017-03-17 17:02:43 +0100357TEST(FeatureProcessorTest, CenterTokenFromClick) {
358 int token_index;
359
360 // Exactly aligned indices.
361 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200362 {6, 11},
363 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100364 EXPECT_EQ(token_index, 1);
365
366 // Click is contained in a token.
367 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200368 {13, 17},
369 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100370 EXPECT_EQ(token_index, 2);
371
372 // Click spans two tokens.
373 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200374 {6, 17},
375 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100376 EXPECT_EQ(token_index, kInvalidIndex);
377}
378
379TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100380 int token_index;
381
382 // Selection of length 3. Exactly aligned indices.
383 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200384 {7, 27},
385 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
386 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100387 EXPECT_EQ(token_index, 2);
388
389 // Selection of length 1 token. Exactly aligned indices.
390 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200391 {21, 27},
392 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
393 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100394 EXPECT_EQ(token_index, 3);
395
396 // Selection marks sub-token range, with no tokens in it.
397 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200398 {29, 33},
399 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
400 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100401 EXPECT_EQ(token_index, kInvalidIndex);
402
403 // Selection of length 2. Sub-token indices.
404 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200405 {3, 25},
406 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
407 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100408 EXPECT_EQ(token_index, 1);
409
410 // Selection of length 1. Sub-token indices.
411 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200412 {22, 34},
413 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
414 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100415 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400416
417 // Some invalid ones.
418 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
419 EXPECT_EQ(token_index, -1);
420}
421
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200422TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
423 FeatureProcessorOptions options;
424 options.set_context_size(2);
425 options.set_max_selection_span(2);
426 options.set_snap_label_span_boundaries_to_containing_tokens(false);
427
428 TokenizationCodepointRange* config =
429 options.add_tokenization_codepoint_config();
430 config->set_start(32);
431 config->set_end(33);
432 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
433
434 FeatureProcessorOptions::CodepointRange* range;
435 range = options.add_supported_codepoint_ranges();
436 range->set_start(0);
437 range->set_end(128);
438
439 range = options.add_supported_codepoint_ranges();
440 range->set_start(10000);
441 range->set_end(10001);
442
443 range = options.add_supported_codepoint_ranges();
444 range->set_start(20000);
445 range->set_end(30000);
446
447 TestingFeatureProcessor feature_processor(options);
448 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
449 1, feature_processor.Tokenize("aaa bbb ccc")),
450 FloatEq(1.0));
451 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
452 1, feature_processor.Tokenize("aaa bbb ěěě")),
453 FloatEq(2.0 / 3));
454 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
455 1, feature_processor.Tokenize("ěěě řřř ěěě")),
456 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200457 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
458 -1, feature_processor.supported_codepoint_ranges_));
459 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
460 0, feature_processor.supported_codepoint_ranges_));
461 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
462 10, feature_processor.supported_codepoint_ranges_));
463 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
464 127, feature_processor.supported_codepoint_ranges_));
465 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
466 128, feature_processor.supported_codepoint_ranges_));
467 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
468 9999, feature_processor.supported_codepoint_ranges_));
469 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
470 10000, feature_processor.supported_codepoint_ranges_));
471 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
472 10001, feature_processor.supported_codepoint_ranges_));
473 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
474 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200475
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200476 std::vector<Token> tokens;
477 int click_pos;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200478 std::vector<float> extra_features;
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200479 std::unique_ptr<CachedFeatures> cached_features;
480
481 auto feature_fn = [](const std::vector<int>& sparse_features,
482 const std::vector<float>& dense_features,
483 float* embedding) { return true; };
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200484
485 options.set_min_supported_codepoint_ratio(0.0);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200486 TestingFeatureProcessor feature_processor2(options);
487 EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
488 feature_fn, 2, &tokens,
489 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200490
491 options.set_min_supported_codepoint_ratio(0.2);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200492 TestingFeatureProcessor feature_processor3(options);
493 EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
494 feature_fn, 2, &tokens,
495 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200496
497 options.set_min_supported_codepoint_ratio(0.5);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200498 TestingFeatureProcessor feature_processor4(options);
499 EXPECT_FALSE(feature_processor4.ExtractFeatures(
500 "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
501 &cached_features));
502}
503
504TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
505 std::vector<Token> tokens_orig{
506 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
507 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
508 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
509 Token("12", 0, 0)};
510
511 std::vector<Token> tokens;
512 int click_index;
513
514 // Try to click first token and see if it gets padded from left.
515 tokens = tokens_orig;
516 click_index = 0;
517 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
518 // clang-format off
519 EXPECT_EQ(tokens, std::vector<Token>({Token(),
520 Token(),
521 Token("0", 0, 0),
522 Token("1", 0, 0),
523 Token("2", 0, 0)}));
524 // clang-format on
525 EXPECT_EQ(click_index, 2);
526
527 // When we click the second token nothing should get padded.
528 tokens = tokens_orig;
529 click_index = 2;
530 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
531 // clang-format off
532 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
533 Token("1", 0, 0),
534 Token("2", 0, 0),
535 Token("3", 0, 0),
536 Token("4", 0, 0)}));
537 // clang-format on
538 EXPECT_EQ(click_index, 2);
539
540 // When we click the last token tokens should get padded from the right.
541 tokens = tokens_orig;
542 click_index = 12;
543 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
544 // clang-format off
545 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
546 Token("11", 0, 0),
547 Token("12", 0, 0),
548 Token(),
549 Token()}));
550 // clang-format on
551 EXPECT_EQ(click_index, 2);
552}
553
554TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
555 std::vector<Token> tokens_orig{
556 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
557 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
558 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
559 Token("12", 0, 0)};
560
561 std::vector<Token> tokens;
562 int click_index;
563
564 // Try to click first token and see if it gets padded from left to maximum
565 // context_size.
566 tokens = tokens_orig;
567 click_index = 0;
568 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
569 // clang-format off
570 EXPECT_EQ(tokens, std::vector<Token>({Token(),
571 Token(),
572 Token("0", 0, 0),
573 Token("1", 0, 0),
574 Token("2", 0, 0),
575 Token("3", 0, 0),
576 Token("4", 0, 0),
577 Token("5", 0, 0)}));
578 // clang-format on
579 EXPECT_EQ(click_index, 2);
580
581 // Clicking to the middle with enough context should not produce any padding.
582 tokens = tokens_orig;
583 click_index = 6;
584 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
585 // clang-format off
586 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
587 Token("2", 0, 0),
588 Token("3", 0, 0),
589 Token("4", 0, 0),
590 Token("5", 0, 0),
591 Token("6", 0, 0),
592 Token("7", 0, 0),
593 Token("8", 0, 0),
594 Token("9", 0, 0)}));
595 // clang-format on
596 EXPECT_EQ(click_index, 5);
597
598 // Clicking at the end should pad right to maximum context_size.
599 tokens = tokens_orig;
600 click_index = 11;
601 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
602 // clang-format off
603 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
604 Token("7", 0, 0),
605 Token("8", 0, 0),
606 Token("9", 0, 0),
607 Token("10", 0, 0),
608 Token("11", 0, 0),
609 Token("12", 0, 0),
610 Token(),
611 Token()}));
612 // clang-format on
613 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200614}
615
Lukas Zilka40c18de2017-04-10 17:22:22 +0200616TEST(FeatureProcessorTest, ICUTokenize) {
617 FeatureProcessorOptions options;
618 options.set_tokenization_type(
619 libtextclassifier::FeatureProcessorOptions::ICU);
620
621 TestingFeatureProcessor feature_processor(options);
622 std::vector<Token> tokens = feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
623 ASSERT_EQ(tokens,
624 // clang-format off
625 std::vector<Token>({Token("พระบาท", 0, 6),
626 Token("สมเด็จ", 6, 12),
627 Token("พระ", 12, 15),
628 Token("ปร", 15, 17),
629 Token("มิ", 17, 19)}));
630 // clang-format on
631}
632
633TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
634 FeatureProcessorOptions options;
635 options.set_tokenization_type(
636 libtextclassifier::FeatureProcessorOptions::ICU);
637 options.set_icu_preserve_whitespace_tokens(true);
638
639 TestingFeatureProcessor feature_processor(options);
640 std::vector<Token> tokens =
641 feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
642 ASSERT_EQ(tokens,
643 // clang-format off
644 std::vector<Token>({Token("พระบาท", 0, 6),
645 Token(" ", 6, 7),
646 Token("สมเด็จ", 7, 13),
647 Token(" ", 13, 14),
648 Token("พระ", 14, 17),
649 Token(" ", 17, 18),
650 Token("ปร", 18, 20),
651 Token(" ", 20, 21),
652 Token("มิ", 21, 23)}));
653 // clang-format on
654}
655
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200656TEST(FeatureProcessorTest, MixedTokenize) {
657 FeatureProcessorOptions options;
658 options.set_tokenization_type(
659 libtextclassifier::FeatureProcessorOptions::MIXED);
660
661 TokenizationCodepointRange* config =
662 options.add_tokenization_codepoint_config();
663 config->set_start(32);
664 config->set_end(33);
665 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
666
667 FeatureProcessorOptions::CodepointRange* range;
668 range = options.add_internal_tokenizer_codepoint_ranges();
669 range->set_start(0);
670 range->set_end(128);
671
672 range = options.add_internal_tokenizer_codepoint_ranges();
673 range->set_start(128);
674 range->set_end(256);
675
676 range = options.add_internal_tokenizer_codepoint_ranges();
677 range->set_start(256);
678 range->set_end(384);
679
680 range = options.add_internal_tokenizer_codepoint_ranges();
681 range->set_start(384);
682 range->set_end(592);
683
684 TestingFeatureProcessor feature_processor(options);
685 std::vector<Token> tokens = feature_processor.Tokenize(
686 "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
687 ASSERT_EQ(tokens,
688 // clang-format off
689 std::vector<Token>({Token("こんにちは", 0, 5),
690 Token("Japanese-ląnguagę", 5, 22),
691 Token("text", 23, 27),
692 Token("世界", 28, 30),
693 Token("http://www.google.com/", 31, 53)}));
694 // clang-format on
695}
696
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200697TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
698 FeatureProcessorOptions options;
699 options.add_ignored_span_boundary_codepoints('.');
700 options.add_ignored_span_boundary_codepoints(',');
701 options.add_ignored_span_boundary_codepoints('[');
702 options.add_ignored_span_boundary_codepoints(']');
703
704 TestingFeatureProcessor feature_processor(options);
705
706 const std::string text1_utf8 = "ěščř";
707 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
708 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
709 text1.begin(), text1.end(),
710 /*count_from_beginning=*/true),
711 0);
712 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
713 text1.begin(), text1.end(),
714 /*count_from_beginning=*/false),
715 0);
716
717 const std::string text2_utf8 = ".,abčd";
718 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
719 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
720 text2.begin(), text2.end(),
721 /*count_from_beginning=*/true),
722 2);
723 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
724 text2.begin(), text2.end(),
725 /*count_from_beginning=*/false),
726 0);
727
728 const std::string text3_utf8 = ".,abčd[]";
729 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
730 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
731 text3.begin(), text3.end(),
732 /*count_from_beginning=*/true),
733 2);
734 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
735 text3.begin(), text3.end(),
736 /*count_from_beginning=*/false),
737 2);
738
739 const std::string text4_utf8 = "[abčd]";
740 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
741 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
742 text4.begin(), text4.end(),
743 /*count_from_beginning=*/true),
744 1);
745 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
746 text4.begin(), text4.end(),
747 /*count_from_beginning=*/false),
748 1);
749
750 const std::string text5_utf8 = "";
751 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
752 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
753 text5.begin(), text5.end(),
754 /*count_from_beginning=*/true),
755 0);
756 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
757 text5.begin(), text5.end(),
758 /*count_from_beginning=*/false),
759 0);
760
761 const std::string text6_utf8 = "012345ěščř";
762 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
763 UnicodeText::const_iterator text6_begin = text6.begin();
764 std::advance(text6_begin, 6);
765 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
766 text6_begin, text6.end(),
767 /*count_from_beginning=*/true),
768 0);
769 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
770 text6_begin, text6.end(),
771 /*count_from_beginning=*/false),
772 0);
773
774 const std::string text7_utf8 = "012345.,ěščř";
775 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
776 UnicodeText::const_iterator text7_begin = text7.begin();
777 std::advance(text7_begin, 6);
778 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
779 text7_begin, text7.end(),
780 /*count_from_beginning=*/true),
781 2);
782 UnicodeText::const_iterator text7_end = text7.begin();
783 std::advance(text7_end, 8);
784 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
785 text7.begin(), text7_end,
786 /*count_from_beginning=*/false),
787 2);
788
789 // Test not stripping.
790 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
791 "Hello [[[Wořld]] or not?", {0, 24}),
792 std::make_pair(0, 24));
793 // Test basic stripping.
794 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
795 "Hello [[[Wořld]] or not?", {6, 16}),
796 std::make_pair(9, 14));
797 // Test stripping when everything is stripped.
798 EXPECT_EQ(
799 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
800 std::make_pair(6, 6));
801 // Test stripping empty string.
802 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
803 std::make_pair(0, 0));
804}
805
Lukas Zilka726b4d22017-12-13 16:37:03 +0100806TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
807 const std::vector<Token> tokens{Token("Hělló", 0, 5),
808 Token("fěěbař@google.com", 6, 23),
809 Token("heře!", 24, 29)};
810
811 // Spans matching the tokens exactly.
812 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
813 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
814 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
815 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
816 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
817 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
818
819 // Snapping to containing tokens has no effect.
820 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
821 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
822 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
823 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
824 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
825 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
826
827 // Span boundaries inside tokens.
828 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
829 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
830
831 // Tokens adjacent to the span, but not overlapping.
832 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
833 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
834}
835
Matt Sharifid40f9762017-03-14 21:24:23 +0100836} // namespace
837} // namespace libtextclassifier