blob: 88a93f3806d1fa98f48390148aaac076c1bd373c [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "smartselect/feature-processor.h"
18
19#include "gmock/gmock.h"
20#include "gtest/gtest.h"
21
22namespace libtextclassifier {
23namespace {
24
25using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020026using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010027
28TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
29 std::vector<Token> tokens{Token("Hělló", 0, 5, false),
30 Token("fěěbař@google.com", 6, 23, false),
31 Token("heře!", 24, 29, false)};
32
33 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
34
35 // clang-format off
36 EXPECT_THAT(tokens, ElementsAreArray(
37 {Token("Hělló", 0, 5, false),
38 Token("fěě", 6, 9, false),
39 Token("bař", 9, 12, false),
40 Token("@google.com", 12, 23, false),
41 Token("heře!", 24, 29, false)}));
42 // clang-format on
43}
44
45TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
46 std::vector<Token> tokens{Token("Hělló", 0, 5, false),
47 Token("fěěbař@google.com", 6, 23, false),
48 Token("heře!", 24, 29, false)};
49
50 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
51
52 // clang-format off
53 EXPECT_THAT(tokens, ElementsAreArray(
54 {Token("Hělló", 0, 5, false),
55 Token("fěěbař", 6, 12, false),
56 Token("@google.com", 12, 23, false),
57 Token("heře!", 24, 29, false)}));
58 // clang-format on
59}
60
61TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
62 std::vector<Token> tokens{Token("Hělló", 0, 5, false),
63 Token("fěěbař@google.com", 6, 23, false),
64 Token("heře!", 24, 29, false)};
65
66 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
67
68 // clang-format off
69 EXPECT_THAT(tokens, ElementsAreArray(
70 {Token("Hělló", 0, 5, false),
71 Token("fěě", 6, 9, false),
72 Token("bař@google.com", 9, 23, false),
73 Token("heře!", 24, 29, false)}));
74 // clang-format on
75}
76
77TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
78 std::vector<Token> tokens{Token("Hělló", 0, 5, false),
79 Token("fěěbař@google.com", 6, 23, false),
80 Token("heře!", 24, 29, false)};
81
82 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
83
84 // clang-format off
85 EXPECT_THAT(tokens, ElementsAreArray(
86 {Token("Hělló", 0, 5, false),
87 Token("fěěbař@google.com", 6, 23, false),
88 Token("heře!", 24, 29, false)}));
89 // clang-format on
90}
91
92TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
93 std::vector<Token> tokens{Token("Hělló", 0, 5, false),
94 Token("fěěbař@google.com", 6, 23, false),
95 Token("heře!", 24, 29, false)};
96
97 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
98
99 // clang-format off
100 EXPECT_THAT(tokens, ElementsAreArray(
101 {Token("Hě", 0, 2, false),
102 Token("lló", 2, 5, false),
103 Token("fěě", 6, 9, false),
104 Token("bař@google.com", 9, 23, false),
105 Token("heře!", 24, 29, false)}));
106 // clang-format on
107}
108
109TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100110 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
111 const CodepointSpan span = {0, 5};
112 // clang-format off
113 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
114 Token("Lině", 6, 10),
115 Token("Sěcond", 11, 17),
116 Token("Lině", 18, 22),
117 Token("Thiřd", 23, 28),
118 Token("Lině", 29, 33)};
119 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100120
121 // Keeps the first line.
Matt Sharifibe876dc2017-03-17 17:02:43 +0100122 internal::StripTokensFromOtherLines(context, span, &tokens);
123 EXPECT_THAT(tokens,
124 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100125}
126
127TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100128 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
129 const CodepointSpan span = {18, 22};
130 // clang-format off
131 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
132 Token("Lině", 6, 10),
133 Token("Sěcond", 11, 17),
134 Token("Lině", 18, 22),
135 Token("Thiřd", 23, 28),
136 Token("Lině", 29, 33)};
137 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100138
Matt Sharifibe876dc2017-03-17 17:02:43 +0100139 // Keeps the first line.
140 internal::StripTokensFromOtherLines(context, span, &tokens);
141 EXPECT_THAT(tokens, ElementsAreArray(
142 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100143}
144
145TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100146 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
147 const CodepointSpan span = {24, 33};
148 // clang-format off
149 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
150 Token("Lině", 6, 10),
151 Token("Sěcond", 11, 17),
152 Token("Lině", 18, 22),
153 Token("Thiřd", 23, 28),
154 Token("Lině", 29, 33)};
155 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100156
Matt Sharifibe876dc2017-03-17 17:02:43 +0100157 // Keeps the first line.
158 internal::StripTokensFromOtherLines(context, span, &tokens);
159 EXPECT_THAT(tokens, ElementsAreArray(
160 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100161}
162
163TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100164 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
165 const CodepointSpan span = {18, 22};
166 // clang-format off
167 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
168 Token("Lině", 6, 10),
169 Token("Sěcond", 11, 17),
170 Token("Lině", 18, 22),
171 Token("Thiřd", 23, 28),
172 Token("Lině", 29, 33)};
173 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100174
Matt Sharifibe876dc2017-03-17 17:02:43 +0100175 // Keeps the first line.
176 internal::StripTokensFromOtherLines(context, span, &tokens);
177 EXPECT_THAT(tokens, ElementsAreArray(
178 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100179}
180
181TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100182 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
183 const CodepointSpan span = {5, 23};
184 // clang-format off
185 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
186 Token("Lině", 6, 10),
187 Token("Sěcond", 18, 23),
188 Token("Lině", 19, 23),
189 Token("Thiřd", 23, 28),
190 Token("Lině", 29, 33)};
191 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100192
Matt Sharifibe876dc2017-03-17 17:02:43 +0100193 // Keeps the first line.
194 internal::StripTokensFromOtherLines(context, span, &tokens);
195 EXPECT_THAT(tokens, ElementsAreArray(
196 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
197 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
198 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100199}
200
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200201class TestingFeatureProcessor : public FeatureProcessor {
202 public:
203 using FeatureProcessor::FeatureProcessor;
204 using FeatureProcessor::SpanToLabel;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200205 using FeatureProcessor::SupportedCodepointsRatio;
206 using FeatureProcessor::IsCodepointSupported;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200207};
208
209TEST(FeatureProcessorTest, SpanToLabel) {
210 FeatureProcessorOptions options;
211 options.set_context_size(1);
212 options.set_max_selection_span(1);
213 options.set_tokenize_on_space(true);
214 options.set_snap_label_span_boundaries_to_containing_tokens(false);
215
216 TokenizationCodepointRange* config =
217 options.add_tokenization_codepoint_config();
218 config->set_start(32);
219 config->set_end(33);
220 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
221
222 TestingFeatureProcessor feature_processor(options);
223 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
224 ASSERT_EQ(3, tokens.size());
225 int label;
226 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
227 EXPECT_EQ(kInvalidLabel, label);
228 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
229 EXPECT_NE(kInvalidLabel, label);
230 TokenSpan token_span;
231 feature_processor.LabelToTokenSpan(label, &token_span);
232 EXPECT_EQ(0, token_span.first);
233 EXPECT_EQ(0, token_span.second);
234
235 // Reconfigure with snapping enabled.
236 options.set_snap_label_span_boundaries_to_containing_tokens(true);
237 TestingFeatureProcessor feature_processor2(options);
238 int label2;
239 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
240 EXPECT_EQ(label, label2);
241 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
242 EXPECT_EQ(label, label2);
243 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
244 EXPECT_EQ(label, label2);
245
246 // Cross a token boundary.
247 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
248 EXPECT_EQ(kInvalidLabel, label2);
249 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
250 EXPECT_EQ(kInvalidLabel, label2);
251
252 // Multiple tokens.
253 options.set_context_size(2);
254 options.set_max_selection_span(2);
255 TestingFeatureProcessor feature_processor3(options);
256 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
257 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
258 EXPECT_NE(kInvalidLabel, label2);
259 feature_processor3.LabelToTokenSpan(label2, &token_span);
260 EXPECT_EQ(1, token_span.first);
261 EXPECT_EQ(0, token_span.second);
262
263 int label3;
264 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
265 EXPECT_EQ(label2, label3);
266 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
267 EXPECT_EQ(label2, label3);
268 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
269 EXPECT_EQ(label2, label3);
270}
271
Matt Sharifid40f9762017-03-14 21:24:23 +0100272TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
273 FeatureProcessorOptions options;
274 options.set_num_buckets(10);
275 options.set_context_size(7);
276 options.set_max_selection_span(7);
277 options.add_chargram_orders(1);
278 options.set_tokenize_on_space(true);
279 options.set_context_dropout_probability(0.5);
280 options.set_use_variable_context_dropout(true);
281 TokenizationCodepointRange* config =
282 options.add_tokenization_codepoint_config();
283 config->set_start(32);
284 config->set_end(33);
285 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
286 FeatureProcessor feature_processor(options);
287
Matt Sharifid40f9762017-03-14 21:24:23 +0100288 // Test that two subsequent runs with random context dropout produce
289 // different features.
290 feature_processor.SetRandom(new std::mt19937);
291
292 std::vector<std::vector<std::pair<int, float>>> features;
293 std::vector<std::vector<std::pair<int, float>>> features2;
294 std::vector<float> extra_features;
295 std::vector<CodepointSpan> selection_label_spans;
296 int selection_label;
297 CodepointSpan selection_codepoint_label;
298 int classification_label;
299 EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifibe876dc2017-03-17 17:02:43 +0100300 "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
301 &features, &extra_features, &selection_label_spans, &selection_label,
302 &selection_codepoint_label, &classification_label));
Matt Sharifid40f9762017-03-14 21:24:23 +0100303 EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifibe876dc2017-03-17 17:02:43 +0100304 "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
305 &features2, &extra_features, &selection_label_spans, &selection_label,
306 &selection_codepoint_label, &classification_label));
Matt Sharifid40f9762017-03-14 21:24:23 +0100307
308 EXPECT_NE(features, features2);
309}
310
311TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
312 FeatureProcessorOptions options;
313 options.set_num_buckets(10);
314 options.set_context_size(9);
315 options.set_max_selection_span(7);
316 options.add_chargram_orders(1);
317 options.set_tokenize_on_space(true);
318 TokenizationCodepointRange* config =
319 options.add_tokenization_codepoint_config();
320 config->set_start(32);
321 config->set_end(33);
322 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
323 FeatureProcessor feature_processor(options);
324
Matt Sharifid40f9762017-03-14 21:24:23 +0100325 std::vector<std::vector<std::pair<int, float>>> features;
326 std::vector<float> extra_features;
327 std::vector<CodepointSpan> selection_label_spans;
328 int selection_label;
329 CodepointSpan selection_codepoint_label;
330 int classification_label;
331 EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifibe876dc2017-03-17 17:02:43 +0100332 "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
333 &features, &extra_features, &selection_label_spans, &selection_label,
334 &selection_codepoint_label, &classification_label));
Matt Sharifid40f9762017-03-14 21:24:23 +0100335 EXPECT_EQ(19, features.size());
336
337 // Should pad the string.
Matt Sharifid40f9762017-03-14 21:24:23 +0100338 EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
Matt Sharifibe876dc2017-03-17 17:02:43 +0100339 "X", {0, 1}, {0, 1}, "", &features, &extra_features,
Matt Sharifid40f9762017-03-14 21:24:23 +0100340 &selection_label_spans, &selection_label, &selection_codepoint_label,
341 &classification_label));
342 EXPECT_EQ(19, features.size());
343}
344
Matt Sharifibe876dc2017-03-17 17:02:43 +0100345TEST(FeatureProcessorTest, CenterTokenFromClick) {
346 int token_index;
347
348 // Exactly aligned indices.
349 token_index = internal::CenterTokenFromClick(
350 {6, 11}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
351 Token("heře!", 12, 17, false)});
352 EXPECT_EQ(token_index, 1);
353
354 // Click is contained in a token.
355 token_index = internal::CenterTokenFromClick(
356 {13, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
357 Token("heře!", 12, 17, false)});
358 EXPECT_EQ(token_index, 2);
359
360 // Click spans two tokens.
361 token_index = internal::CenterTokenFromClick(
362 {6, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
363 Token("heře!", 12, 17, false)});
364 EXPECT_EQ(token_index, kInvalidIndex);
365}
366
367TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100368 int token_index;
369
370 // Selection of length 3. Exactly aligned indices.
371 token_index = internal::CenterTokenFromMiddleOfSelection(
372 {7, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
373 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
374 Token("Token5", 28, 34, false)});
375 EXPECT_EQ(token_index, 2);
376
377 // Selection of length 1 token. Exactly aligned indices.
378 token_index = internal::CenterTokenFromMiddleOfSelection(
379 {21, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
380 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
381 Token("Token5", 28, 34, false)});
382 EXPECT_EQ(token_index, 3);
383
384 // Selection marks sub-token range, with no tokens in it.
385 token_index = internal::CenterTokenFromMiddleOfSelection(
386 {29, 33}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
387 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
388 Token("Token5", 28, 34, false)});
389 EXPECT_EQ(token_index, kInvalidIndex);
390
391 // Selection of length 2. Sub-token indices.
392 token_index = internal::CenterTokenFromMiddleOfSelection(
393 {3, 25}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
394 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
395 Token("Token5", 28, 34, false)});
396 EXPECT_EQ(token_index, 1);
397
398 // Selection of length 1. Sub-token indices.
399 token_index = internal::CenterTokenFromMiddleOfSelection(
400 {22, 34}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
401 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
402 Token("Token5", 28, 34, false)});
403 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400404
405 // Some invalid ones.
406 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
407 EXPECT_EQ(token_index, -1);
408}
409
410TEST(FeatureProcessorTest, GetFeaturesForSharing) {
411 FeatureProcessorOptions options;
412 options.set_num_buckets(10);
413 options.set_context_size(9);
414 options.set_max_selection_span(7);
415 options.add_chargram_orders(1);
416 options.set_tokenize_on_space(true);
417 options.set_center_token_selection_method(
418 FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION);
419 options.set_only_use_line_with_click(true);
420 options.set_split_tokens_on_selection_boundaries(true);
421 options.set_extract_selection_mask_feature(true);
422 TokenizationCodepointRange* config =
423 options.add_tokenization_codepoint_config();
424 config->set_start(32);
425 config->set_end(33);
426 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
427 config = options.add_tokenization_codepoint_config();
428 config->set_start(10);
429 config->set_end(11);
430 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
431 FeatureProcessor feature_processor(options);
432
433 std::vector<std::vector<std::pair<int, float>>> features;
434 std::vector<float> extra_features;
435 std::vector<CodepointSpan> selection_label_spans;
436 int selection_label;
437 CodepointSpan selection_codepoint_label;
438 int classification_label;
439 EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
440 "line 1\nline2\nsome entity\n line 4", {13, 24}, {13, 24}, "", &features,
441 &extra_features, &selection_label_spans, &selection_label,
442 &selection_codepoint_label, &classification_label));
443 EXPECT_EQ(19, features.size());
Matt Sharifibe876dc2017-03-17 17:02:43 +0100444}
445
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200446TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
447 FeatureProcessorOptions options;
448 options.set_context_size(2);
449 options.set_max_selection_span(2);
450 options.set_snap_label_span_boundaries_to_containing_tokens(false);
451
452 TokenizationCodepointRange* config =
453 options.add_tokenization_codepoint_config();
454 config->set_start(32);
455 config->set_end(33);
456 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
457
458 FeatureProcessorOptions::CodepointRange* range;
459 range = options.add_supported_codepoint_ranges();
460 range->set_start(0);
461 range->set_end(128);
462
463 range = options.add_supported_codepoint_ranges();
464 range->set_start(10000);
465 range->set_end(10001);
466
467 range = options.add_supported_codepoint_ranges();
468 range->set_start(20000);
469 range->set_end(30000);
470
471 TestingFeatureProcessor feature_processor(options);
472 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
473 1, feature_processor.Tokenize("aaa bbb ccc")),
474 FloatEq(1.0));
475 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
476 1, feature_processor.Tokenize("aaa bbb ěěě")),
477 FloatEq(2.0 / 3));
478 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
479 1, feature_processor.Tokenize("ěěě řřř ěěě")),
480 FloatEq(0.0));
481 EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
482 EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
483 EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
484 EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
485 EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
486 EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
487 EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
488 EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
489 EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));
490
491 std::vector<nlp_core::FeatureVector> features;
492 std::vector<float> extra_features;
493
494 options.set_min_supported_codepoint_ratio(0.0);
495 feature_processor = TestingFeatureProcessor(options);
496 EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
497 &extra_features,
498 /*selection_label_spans=*/nullptr));
499
500 options.set_min_supported_codepoint_ratio(0.2);
501 feature_processor = TestingFeatureProcessor(options);
502 EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
503 &extra_features,
504 /*selection_label_spans=*/nullptr));
505
506 options.set_min_supported_codepoint_ratio(0.5);
507 feature_processor = TestingFeatureProcessor(options);
508 EXPECT_FALSE(feature_processor.GetFeatures(
509 "ěěě řřř eee", {4, 7}, &features, &extra_features,
510 /*selection_label_spans=*/nullptr));
511}
512
Matt Sharifid40f9762017-03-14 21:24:23 +0100513} // namespace
514} // namespace libtextclassifier