blob: e3a39e37793edb5647c4094c3f869f7fe84f52b2 [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "smartselect/feature-processor.h"
18
19#include "gmock/gmock.h"
20#include "gtest/gtest.h"
21
22namespace libtextclassifier {
23namespace {
24
25using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020026using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010027
28TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020029 std::vector<Token> tokens{Token("Hělló", 0, 5),
30 Token("fěěbař@google.com", 6, 23),
31 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010032
33 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
34
35 // clang-format off
36 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020037 {Token("Hělló", 0, 5),
38 Token("fěě", 6, 9),
39 Token("bař", 9, 12),
40 Token("@google.com", 12, 23),
41 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010042 // clang-format on
43}
44
45TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020046 std::vector<Token> tokens{Token("Hělló", 0, 5),
47 Token("fěěbař@google.com", 6, 23),
48 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010049
50 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
51
52 // clang-format off
53 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020054 {Token("Hělló", 0, 5),
55 Token("fěěbař", 6, 12),
56 Token("@google.com", 12, 23),
57 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010058 // clang-format on
59}
60
61TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020062 std::vector<Token> tokens{Token("Hělló", 0, 5),
63 Token("fěěbař@google.com", 6, 23),
64 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010065
66 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
67
68 // clang-format off
69 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020070 {Token("Hělló", 0, 5),
71 Token("fěě", 6, 9),
72 Token("bař@google.com", 9, 23),
73 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010074 // clang-format on
75}
76
77TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020078 std::vector<Token> tokens{Token("Hělló", 0, 5),
79 Token("fěěbař@google.com", 6, 23),
80 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010081
82 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
83
84 // clang-format off
85 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020086 {Token("Hělló", 0, 5),
87 Token("fěěbař@google.com", 6, 23),
88 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010089 // clang-format on
90}
91
92TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020093 std::vector<Token> tokens{Token("Hělló", 0, 5),
94 Token("fěěbař@google.com", 6, 23),
95 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010096
97 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
98
99 // clang-format off
100 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200101 {Token("Hě", 0, 2),
102 Token("lló", 2, 5),
103 Token("fěě", 6, 9),
104 Token("bař@google.com", 9, 23),
105 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100106 // clang-format on
107}
108
109TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100110 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
111 const CodepointSpan span = {0, 5};
112 // clang-format off
113 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
114 Token("Lině", 6, 10),
115 Token("Sěcond", 11, 17),
116 Token("Lině", 18, 22),
117 Token("Thiřd", 23, 28),
118 Token("Lině", 29, 33)};
119 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100120
121 // Keeps the first line.
Matt Sharifibe876dc2017-03-17 17:02:43 +0100122 internal::StripTokensFromOtherLines(context, span, &tokens);
123 EXPECT_THAT(tokens,
124 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100125}
126
127TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100128 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
129 const CodepointSpan span = {18, 22};
130 // clang-format off
131 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
132 Token("Lině", 6, 10),
133 Token("Sěcond", 11, 17),
134 Token("Lině", 18, 22),
135 Token("Thiřd", 23, 28),
136 Token("Lině", 29, 33)};
137 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100138
Matt Sharifibe876dc2017-03-17 17:02:43 +0100139 // Keeps the first line.
140 internal::StripTokensFromOtherLines(context, span, &tokens);
141 EXPECT_THAT(tokens, ElementsAreArray(
142 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100143}
144
145TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100146 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
147 const CodepointSpan span = {24, 33};
148 // clang-format off
149 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
150 Token("Lině", 6, 10),
151 Token("Sěcond", 11, 17),
152 Token("Lině", 18, 22),
153 Token("Thiřd", 23, 28),
154 Token("Lině", 29, 33)};
155 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100156
Matt Sharifibe876dc2017-03-17 17:02:43 +0100157 // Keeps the first line.
158 internal::StripTokensFromOtherLines(context, span, &tokens);
159 EXPECT_THAT(tokens, ElementsAreArray(
160 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100161}
162
163TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100164 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
165 const CodepointSpan span = {18, 22};
166 // clang-format off
167 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
168 Token("Lině", 6, 10),
169 Token("Sěcond", 11, 17),
170 Token("Lině", 18, 22),
171 Token("Thiřd", 23, 28),
172 Token("Lině", 29, 33)};
173 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100174
Matt Sharifibe876dc2017-03-17 17:02:43 +0100175 // Keeps the first line.
176 internal::StripTokensFromOtherLines(context, span, &tokens);
177 EXPECT_THAT(tokens, ElementsAreArray(
178 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100179}
180
181TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100182 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
183 const CodepointSpan span = {5, 23};
184 // clang-format off
185 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
186 Token("Lině", 6, 10),
187 Token("Sěcond", 18, 23),
188 Token("Lině", 19, 23),
189 Token("Thiřd", 23, 28),
190 Token("Lině", 29, 33)};
191 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100192
Matt Sharifibe876dc2017-03-17 17:02:43 +0100193 // Keeps the first line.
194 internal::StripTokensFromOtherLines(context, span, &tokens);
195 EXPECT_THAT(tokens, ElementsAreArray(
196 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
197 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
198 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100199}
200
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200201class TestingFeatureProcessor : public FeatureProcessor {
202 public:
203 using FeatureProcessor::FeatureProcessor;
204 using FeatureProcessor::SpanToLabel;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200205 using FeatureProcessor::SupportedCodepointsRatio;
206 using FeatureProcessor::IsCodepointSupported;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200207};
208
209TEST(FeatureProcessorTest, SpanToLabel) {
210 FeatureProcessorOptions options;
211 options.set_context_size(1);
212 options.set_max_selection_span(1);
213 options.set_tokenize_on_space(true);
214 options.set_snap_label_span_boundaries_to_containing_tokens(false);
215
216 TokenizationCodepointRange* config =
217 options.add_tokenization_codepoint_config();
218 config->set_start(32);
219 config->set_end(33);
220 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
221
222 TestingFeatureProcessor feature_processor(options);
223 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
224 ASSERT_EQ(3, tokens.size());
225 int label;
226 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
227 EXPECT_EQ(kInvalidLabel, label);
228 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
229 EXPECT_NE(kInvalidLabel, label);
230 TokenSpan token_span;
231 feature_processor.LabelToTokenSpan(label, &token_span);
232 EXPECT_EQ(0, token_span.first);
233 EXPECT_EQ(0, token_span.second);
234
235 // Reconfigure with snapping enabled.
236 options.set_snap_label_span_boundaries_to_containing_tokens(true);
237 TestingFeatureProcessor feature_processor2(options);
238 int label2;
239 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
240 EXPECT_EQ(label, label2);
241 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
242 EXPECT_EQ(label, label2);
243 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
244 EXPECT_EQ(label, label2);
245
246 // Cross a token boundary.
247 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
248 EXPECT_EQ(kInvalidLabel, label2);
249 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
250 EXPECT_EQ(kInvalidLabel, label2);
251
252 // Multiple tokens.
253 options.set_context_size(2);
254 options.set_max_selection_span(2);
255 TestingFeatureProcessor feature_processor3(options);
256 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
257 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
258 EXPECT_NE(kInvalidLabel, label2);
259 feature_processor3.LabelToTokenSpan(label2, &token_span);
260 EXPECT_EQ(1, token_span.first);
261 EXPECT_EQ(0, token_span.second);
262
263 int label3;
264 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
265 EXPECT_EQ(label2, label3);
266 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
267 EXPECT_EQ(label2, label3);
268 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
269 EXPECT_EQ(label2, label3);
270}
271
Matt Sharifibe876dc2017-03-17 17:02:43 +0100272TEST(FeatureProcessorTest, CenterTokenFromClick) {
273 int token_index;
274
275 // Exactly aligned indices.
276 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200277 {6, 11},
278 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100279 EXPECT_EQ(token_index, 1);
280
281 // Click is contained in a token.
282 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200283 {13, 17},
284 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100285 EXPECT_EQ(token_index, 2);
286
287 // Click spans two tokens.
288 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200289 {6, 17},
290 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100291 EXPECT_EQ(token_index, kInvalidIndex);
292}
293
294TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100295 int token_index;
296
297 // Selection of length 3. Exactly aligned indices.
298 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200299 {7, 27},
300 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
301 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100302 EXPECT_EQ(token_index, 2);
303
304 // Selection of length 1 token. Exactly aligned indices.
305 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200306 {21, 27},
307 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
308 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100309 EXPECT_EQ(token_index, 3);
310
311 // Selection marks sub-token range, with no tokens in it.
312 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200313 {29, 33},
314 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
315 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100316 EXPECT_EQ(token_index, kInvalidIndex);
317
318 // Selection of length 2. Sub-token indices.
319 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200320 {3, 25},
321 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
322 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100323 EXPECT_EQ(token_index, 1);
324
325 // Selection of length 1. Sub-token indices.
326 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200327 {22, 34},
328 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
329 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100330 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400331
332 // Some invalid ones.
333 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
334 EXPECT_EQ(token_index, -1);
335}
336
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200337TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
338 FeatureProcessorOptions options;
339 options.set_context_size(2);
340 options.set_max_selection_span(2);
341 options.set_snap_label_span_boundaries_to_containing_tokens(false);
342
343 TokenizationCodepointRange* config =
344 options.add_tokenization_codepoint_config();
345 config->set_start(32);
346 config->set_end(33);
347 config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
348
349 FeatureProcessorOptions::CodepointRange* range;
350 range = options.add_supported_codepoint_ranges();
351 range->set_start(0);
352 range->set_end(128);
353
354 range = options.add_supported_codepoint_ranges();
355 range->set_start(10000);
356 range->set_end(10001);
357
358 range = options.add_supported_codepoint_ranges();
359 range->set_start(20000);
360 range->set_end(30000);
361
362 TestingFeatureProcessor feature_processor(options);
363 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
364 1, feature_processor.Tokenize("aaa bbb ccc")),
365 FloatEq(1.0));
366 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
367 1, feature_processor.Tokenize("aaa bbb ěěě")),
368 FloatEq(2.0 / 3));
369 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
370 1, feature_processor.Tokenize("ěěě řřř ěěě")),
371 FloatEq(0.0));
372 EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
373 EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
374 EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
375 EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
376 EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
377 EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
378 EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
379 EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
380 EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));
381
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200382 std::vector<Token> tokens;
383 int click_pos;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200384 std::vector<float> extra_features;
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200385 std::unique_ptr<CachedFeatures> cached_features;
386
387 auto feature_fn = [](const std::vector<int>& sparse_features,
388 const std::vector<float>& dense_features,
389 float* embedding) { return true; };
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200390
391 options.set_min_supported_codepoint_ratio(0.0);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200392 TestingFeatureProcessor feature_processor2(options);
393 EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
394 feature_fn, 2, &tokens,
395 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200396
397 options.set_min_supported_codepoint_ratio(0.2);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200398 TestingFeatureProcessor feature_processor3(options);
399 EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
400 feature_fn, 2, &tokens,
401 &click_pos, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200402
403 options.set_min_supported_codepoint_ratio(0.5);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200404 TestingFeatureProcessor feature_processor4(options);
405 EXPECT_FALSE(feature_processor4.ExtractFeatures(
406 "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
407 &cached_features));
408}
409
410TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
411 std::vector<Token> tokens_orig{
412 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
413 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
414 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
415 Token("12", 0, 0)};
416
417 std::vector<Token> tokens;
418 int click_index;
419
420 // Try to click first token and see if it gets padded from left.
421 tokens = tokens_orig;
422 click_index = 0;
423 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
424 // clang-format off
425 EXPECT_EQ(tokens, std::vector<Token>({Token(),
426 Token(),
427 Token("0", 0, 0),
428 Token("1", 0, 0),
429 Token("2", 0, 0)}));
430 // clang-format on
431 EXPECT_EQ(click_index, 2);
432
433 // When we click the second token nothing should get padded.
434 tokens = tokens_orig;
435 click_index = 2;
436 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
437 // clang-format off
438 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
439 Token("1", 0, 0),
440 Token("2", 0, 0),
441 Token("3", 0, 0),
442 Token("4", 0, 0)}));
443 // clang-format on
444 EXPECT_EQ(click_index, 2);
445
446 // When we click the last token tokens should get padded from the right.
447 tokens = tokens_orig;
448 click_index = 12;
449 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
450 // clang-format off
451 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
452 Token("11", 0, 0),
453 Token("12", 0, 0),
454 Token(),
455 Token()}));
456 // clang-format on
457 EXPECT_EQ(click_index, 2);
458}
459
460TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
461 std::vector<Token> tokens_orig{
462 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
463 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
464 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
465 Token("12", 0, 0)};
466
467 std::vector<Token> tokens;
468 int click_index;
469
470 // Try to click first token and see if it gets padded from left to maximum
471 // context_size.
472 tokens = tokens_orig;
473 click_index = 0;
474 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
475 // clang-format off
476 EXPECT_EQ(tokens, std::vector<Token>({Token(),
477 Token(),
478 Token("0", 0, 0),
479 Token("1", 0, 0),
480 Token("2", 0, 0),
481 Token("3", 0, 0),
482 Token("4", 0, 0),
483 Token("5", 0, 0)}));
484 // clang-format on
485 EXPECT_EQ(click_index, 2);
486
487 // Clicking to the middle with enough context should not produce any padding.
488 tokens = tokens_orig;
489 click_index = 6;
490 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
491 // clang-format off
492 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
493 Token("2", 0, 0),
494 Token("3", 0, 0),
495 Token("4", 0, 0),
496 Token("5", 0, 0),
497 Token("6", 0, 0),
498 Token("7", 0, 0),
499 Token("8", 0, 0),
500 Token("9", 0, 0)}));
501 // clang-format on
502 EXPECT_EQ(click_index, 5);
503
504 // Clicking at the end should pad right to maximum context_size.
505 tokens = tokens_orig;
506 click_index = 11;
507 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
508 // clang-format off
509 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
510 Token("7", 0, 0),
511 Token("8", 0, 0),
512 Token("9", 0, 0),
513 Token("10", 0, 0),
514 Token("11", 0, 0),
515 Token("12", 0, 0),
516 Token(),
517 Token()}));
518 // clang-format on
519 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200520}
521
Matt Sharifid40f9762017-03-14 21:24:23 +0100522} // namespace
523} // namespace libtextclassifier