blob: 58b3033b85e91e6e54ba6f243942f20c82167f0e [file] [log] [blame]
Matt Sharifid40f9762017-03-14 21:24:23 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Lukas Zilka21d8c982018-01-24 11:11:20 +010017#include "feature-processor.h"
18
19#include "model-executor.h"
20#include "tensor-view.h"
Matt Sharifid40f9762017-03-14 21:24:23 +010021
22#include "gmock/gmock.h"
23#include "gtest/gtest.h"
24
Lukas Zilka21d8c982018-01-24 11:11:20 +010025namespace libtextclassifier2 {
Matt Sharifid40f9762017-03-14 21:24:23 +010026namespace {
27
28using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020029using testing::FloatEq;
Lukas Zilkaba849e72018-03-08 14:48:21 +010030using testing::Matcher;
Matt Sharifid40f9762017-03-14 21:24:23 +010031
Lukas Zilka21d8c982018-01-24 11:11:20 +010032flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33 const FeatureProcessorOptionsT& options) {
34 flatbuffers::FlatBufferBuilder builder;
35 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36 return builder.Release();
37}
38
// Returns a copy of the elements of `vector` in the half-open index range
// [start, end).
//
// Both bounds are clamped to [0, vector.size()], and an inverted range
// (start > end) yields an empty vector, so the iterator arithmetic below can
// never step outside the container. For in-range arguments the result is the
// same as a plain iterator-pair copy.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const int size = static_cast<int>(vector.size());
  // Clamp the end first so that the start can then be clamped against it.
  const int last = end < 0 ? 0 : (end > size ? size : end);
  const int first = start < 0 ? 0 : (start > last ? last : start);
  return std::vector<T>(vector.begin() + first, vector.begin() + last);
}
43
44Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45 std::vector<Matcher<float>> matchers;
46 for (const float value : values) {
47 matchers.push_back(FloatEq(value));
48 }
49 return ElementsAreArray(matchers);
50}
51
Lukas Zilka726b4d22017-12-13 16:37:03 +010052class TestingFeatureProcessor : public FeatureProcessor {
53 public:
54 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55 using FeatureProcessor::FeatureProcessor;
56 using FeatureProcessor::ICUTokenize;
57 using FeatureProcessor::IsCodepointInRanges;
58 using FeatureProcessor::SpanToLabel;
59 using FeatureProcessor::StripTokensFromOtherLines;
60 using FeatureProcessor::supported_codepoint_ranges_;
61 using FeatureProcessor::SupportedCodepointsRatio;
62};
63
Lukas Zilka21d8c982018-01-24 11:11:20 +010064// EmbeddingExecutor that always returns features based on
65class FakeEmbeddingExecutor : public EmbeddingExecutor {
66 public:
67 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
Lukas Zilkaba849e72018-03-08 14:48:21 +010068 int dest_size) const override {
Lukas Zilka21d8c982018-01-24 11:11:20 +010069 TC_CHECK_GE(dest_size, 4);
70 EXPECT_EQ(sparse_features.size(), 1);
71 dest[0] = sparse_features.data()[0];
72 dest[1] = sparse_features.data()[0];
73 dest[2] = -sparse_features.data()[0];
74 dest[3] = -sparse_features.data()[0];
75 return true;
76 }
77
78 private:
79 std::vector<float> storage_;
80};
81
Matt Sharifid40f9762017-03-14 21:24:23 +010082TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020083 std::vector<Token> tokens{Token("Hělló", 0, 5),
84 Token("fěěbař@google.com", 6, 23),
85 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010086
87 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
88
89 // clang-format off
90 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020091 {Token("Hělló", 0, 5),
92 Token("fěě", 6, 9),
93 Token("bař", 9, 12),
94 Token("@google.com", 12, 23),
95 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010096 // clang-format on
97}
98
99TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200100 std::vector<Token> tokens{Token("Hělló", 0, 5),
101 Token("fěěbař@google.com", 6, 23),
102 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100103
104 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
105
106 // clang-format off
107 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200108 {Token("Hělló", 0, 5),
109 Token("fěěbař", 6, 12),
110 Token("@google.com", 12, 23),
111 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100112 // clang-format on
113}
114
115TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200116 std::vector<Token> tokens{Token("Hělló", 0, 5),
117 Token("fěěbař@google.com", 6, 23),
118 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100119
120 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
121
122 // clang-format off
123 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200124 {Token("Hělló", 0, 5),
125 Token("fěě", 6, 9),
126 Token("bař@google.com", 9, 23),
127 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100128 // clang-format on
129}
130
131TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200132 std::vector<Token> tokens{Token("Hělló", 0, 5),
133 Token("fěěbař@google.com", 6, 23),
134 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100135
136 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
137
138 // clang-format off
139 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200140 {Token("Hělló", 0, 5),
141 Token("fěěbař@google.com", 6, 23),
142 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100143 // clang-format on
144}
145
146TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200147 std::vector<Token> tokens{Token("Hělló", 0, 5),
148 Token("fěěbař@google.com", 6, 23),
149 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100150
151 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
152
153 // clang-format off
154 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200155 {Token("Hě", 0, 2),
156 Token("lló", 2, 5),
157 Token("fěě", 6, 9),
158 Token("bař@google.com", 9, 23),
159 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100160 // clang-format on
161}
162
163TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100164 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100165 FeatureProcessorOptionsT options;
166 options.only_use_line_with_click = true;
167 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
168 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100169 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
170 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100171
Matt Sharifibe876dc2017-03-17 17:02:43 +0100172 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
173 const CodepointSpan span = {0, 5};
174 // clang-format off
175 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
176 Token("Lině", 6, 10),
177 Token("Sěcond", 11, 17),
178 Token("Lině", 18, 22),
179 Token("Thiřd", 23, 28),
180 Token("Lině", 29, 33)};
181 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100182
183 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100184 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100185 EXPECT_THAT(tokens,
186 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100187}
188
189TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100190 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100191 FeatureProcessorOptionsT options;
192 options.only_use_line_with_click = true;
193 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
194 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100195 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
196 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100197
Matt Sharifibe876dc2017-03-17 17:02:43 +0100198 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
199 const CodepointSpan span = {18, 22};
200 // clang-format off
201 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
202 Token("Lině", 6, 10),
203 Token("Sěcond", 11, 17),
204 Token("Lině", 18, 22),
205 Token("Thiřd", 23, 28),
206 Token("Lině", 29, 33)};
207 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100208
Matt Sharifibe876dc2017-03-17 17:02:43 +0100209 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100210 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100211 EXPECT_THAT(tokens, ElementsAreArray(
212 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100213}
214
215TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100216 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100217 FeatureProcessorOptionsT options;
218 options.only_use_line_with_click = true;
219 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
220 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100221 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
222 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100223
Matt Sharifibe876dc2017-03-17 17:02:43 +0100224 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
225 const CodepointSpan span = {24, 33};
226 // clang-format off
227 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
228 Token("Lině", 6, 10),
229 Token("Sěcond", 11, 17),
230 Token("Lině", 18, 22),
231 Token("Thiřd", 23, 28),
232 Token("Lině", 29, 33)};
233 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100234
Matt Sharifibe876dc2017-03-17 17:02:43 +0100235 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100236 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100237 EXPECT_THAT(tokens, ElementsAreArray(
238 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100239}
240
241TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100242 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100243 FeatureProcessorOptionsT options;
244 options.only_use_line_with_click = true;
245 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
246 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100247 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
248 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100249
Matt Sharifibe876dc2017-03-17 17:02:43 +0100250 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
251 const CodepointSpan span = {18, 22};
252 // clang-format off
253 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
254 Token("Lině", 6, 10),
255 Token("Sěcond", 11, 17),
256 Token("Lině", 18, 22),
257 Token("Thiřd", 23, 28),
258 Token("Lině", 29, 33)};
259 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100260
Matt Sharifibe876dc2017-03-17 17:02:43 +0100261 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100262 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100263 EXPECT_THAT(tokens, ElementsAreArray(
264 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100265}
266
267TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100268 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100269 FeatureProcessorOptionsT options;
270 options.only_use_line_with_click = true;
271 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
272 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100273 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
274 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100275
Matt Sharifibe876dc2017-03-17 17:02:43 +0100276 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
277 const CodepointSpan span = {5, 23};
278 // clang-format off
279 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
280 Token("Lině", 6, 10),
281 Token("Sěcond", 18, 23),
282 Token("Lině", 19, 23),
283 Token("Thiřd", 23, 28),
284 Token("Lině", 29, 33)};
285 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100286
Matt Sharifibe876dc2017-03-17 17:02:43 +0100287 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100288 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100289 EXPECT_THAT(tokens, ElementsAreArray(
290 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
291 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
292 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100293}
294
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200295TEST(FeatureProcessorTest, SpanToLabel) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100296 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100297 FeatureProcessorOptionsT options;
298 options.context_size = 1;
299 options.max_selection_span = 1;
300 options.snap_label_span_boundaries_to_containing_tokens = false;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200301
Lukas Zilka21d8c982018-01-24 11:11:20 +0100302 options.tokenization_codepoint_config.emplace_back(
303 new TokenizationCodepointRangeT());
304 auto& config = options.tokenization_codepoint_config.back();
305 config->start = 32;
306 config->end = 33;
307 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200308
Lukas Zilka21d8c982018-01-24 11:11:20 +0100309 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
310 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100311 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
312 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200313 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
314 ASSERT_EQ(3, tokens.size());
315 int label;
316 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
317 EXPECT_EQ(kInvalidLabel, label);
318 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
319 EXPECT_NE(kInvalidLabel, label);
320 TokenSpan token_span;
321 feature_processor.LabelToTokenSpan(label, &token_span);
322 EXPECT_EQ(0, token_span.first);
323 EXPECT_EQ(0, token_span.second);
324
325 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100326 options.snap_label_span_boundaries_to_containing_tokens = true;
327 flatbuffers::DetachedBuffer options2_fb =
328 PackFeatureProcessorOptions(options);
329 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100330 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
331 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200332 int label2;
333 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
334 EXPECT_EQ(label, label2);
335 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
336 EXPECT_EQ(label, label2);
337 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
338 EXPECT_EQ(label, label2);
339
340 // Cross a token boundary.
341 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
342 EXPECT_EQ(kInvalidLabel, label2);
343 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
344 EXPECT_EQ(kInvalidLabel, label2);
345
346 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100347 options.context_size = 2;
348 options.max_selection_span = 2;
349 flatbuffers::DetachedBuffer options3_fb =
350 PackFeatureProcessorOptions(options);
351 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100352 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
353 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200354 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
355 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
356 EXPECT_NE(kInvalidLabel, label2);
357 feature_processor3.LabelToTokenSpan(label2, &token_span);
358 EXPECT_EQ(1, token_span.first);
359 EXPECT_EQ(0, token_span.second);
360
361 int label3;
362 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
363 EXPECT_EQ(label2, label3);
364 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
365 EXPECT_EQ(label2, label3);
366 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
367 EXPECT_EQ(label2, label3);
368}
369
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200370TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100371 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100372 FeatureProcessorOptionsT options;
373 options.context_size = 1;
374 options.max_selection_span = 1;
375 options.snap_label_span_boundaries_to_containing_tokens = false;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200376
Lukas Zilka21d8c982018-01-24 11:11:20 +0100377 options.tokenization_codepoint_config.emplace_back(
378 new TokenizationCodepointRangeT());
379 auto& config = options.tokenization_codepoint_config.back();
380 config->start = 32;
381 config->end = 33;
382 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200383
Lukas Zilka21d8c982018-01-24 11:11:20 +0100384 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
385 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100386 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
387 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200388 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
389 ASSERT_EQ(3, tokens.size());
390 int label;
391 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
392 EXPECT_EQ(kInvalidLabel, label);
393 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
394 EXPECT_NE(kInvalidLabel, label);
395 TokenSpan token_span;
396 feature_processor.LabelToTokenSpan(label, &token_span);
397 EXPECT_EQ(0, token_span.first);
398 EXPECT_EQ(0, token_span.second);
399
400 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100401 options.snap_label_span_boundaries_to_containing_tokens = true;
402 flatbuffers::DetachedBuffer options2_fb =
403 PackFeatureProcessorOptions(options);
404 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100405 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
406 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200407 int label2;
408 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
409 EXPECT_EQ(label, label2);
410 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
411 EXPECT_EQ(label, label2);
412 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
413 EXPECT_EQ(label, label2);
414
415 // Cross a token boundary.
416 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
417 EXPECT_EQ(kInvalidLabel, label2);
418 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
419 EXPECT_EQ(kInvalidLabel, label2);
420
421 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100422 options.context_size = 2;
423 options.max_selection_span = 2;
424 flatbuffers::DetachedBuffer options3_fb =
425 PackFeatureProcessorOptions(options);
426 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100427 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
428 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200429 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
430 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
431 EXPECT_NE(kInvalidLabel, label2);
432 feature_processor3.LabelToTokenSpan(label2, &token_span);
433 EXPECT_EQ(1, token_span.first);
434 EXPECT_EQ(0, token_span.second);
435
436 int label3;
437 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
438 EXPECT_EQ(label2, label3);
439 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
440 EXPECT_EQ(label2, label3);
441 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
442 EXPECT_EQ(label2, label3);
443}
444
Matt Sharifibe876dc2017-03-17 17:02:43 +0100445TEST(FeatureProcessorTest, CenterTokenFromClick) {
446 int token_index;
447
448 // Exactly aligned indices.
449 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200450 {6, 11},
451 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100452 EXPECT_EQ(token_index, 1);
453
454 // Click is contained in a token.
455 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200456 {13, 17},
457 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100458 EXPECT_EQ(token_index, 2);
459
460 // Click spans two tokens.
461 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200462 {6, 17},
463 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100464 EXPECT_EQ(token_index, kInvalidIndex);
465}
466
467TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100468 int token_index;
469
470 // Selection of length 3. Exactly aligned indices.
471 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200472 {7, 27},
473 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
474 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100475 EXPECT_EQ(token_index, 2);
476
477 // Selection of length 1 token. Exactly aligned indices.
478 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200479 {21, 27},
480 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
481 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100482 EXPECT_EQ(token_index, 3);
483
484 // Selection marks sub-token range, with no tokens in it.
485 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200486 {29, 33},
487 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
488 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100489 EXPECT_EQ(token_index, kInvalidIndex);
490
491 // Selection of length 2. Sub-token indices.
492 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200493 {3, 25},
494 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
495 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100496 EXPECT_EQ(token_index, 1);
497
498 // Selection of length 1. Sub-token indices.
499 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200500 {22, 34},
501 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
502 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100503 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400504
505 // Some invalid ones.
506 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
507 EXPECT_EQ(token_index, -1);
508}
509
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200510TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100511 FeatureProcessorOptionsT options;
512 options.context_size = 2;
513 options.max_selection_span = 2;
514 options.snap_label_span_boundaries_to_containing_tokens = false;
515 options.feature_version = 2;
516 options.embedding_size = 4;
517 options.bounds_sensitive_features.reset(
518 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
519 options.bounds_sensitive_features->enabled = true;
520 options.bounds_sensitive_features->num_tokens_before = 5;
521 options.bounds_sensitive_features->num_tokens_inside_left = 3;
522 options.bounds_sensitive_features->num_tokens_inside_right = 3;
523 options.bounds_sensitive_features->num_tokens_after = 5;
524 options.bounds_sensitive_features->include_inside_bag = true;
525 options.bounds_sensitive_features->include_inside_length = true;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200526
Lukas Zilka21d8c982018-01-24 11:11:20 +0100527 options.tokenization_codepoint_config.emplace_back(
528 new TokenizationCodepointRangeT());
529 auto& config = options.tokenization_codepoint_config.back();
530 config->start = 32;
531 config->end = 33;
532 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200533
Lukas Zilka21d8c982018-01-24 11:11:20 +0100534 {
535 options.supported_codepoint_ranges.emplace_back(
536 new FeatureProcessorOptions_::CodepointRangeT());
537 auto& range = options.supported_codepoint_ranges.back();
538 range->start = 0;
539 range->end = 128;
540 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200541
Lukas Zilka21d8c982018-01-24 11:11:20 +0100542 {
543 options.supported_codepoint_ranges.emplace_back(
544 new FeatureProcessorOptions_::CodepointRangeT());
545 auto& range = options.supported_codepoint_ranges.back();
546 range->start = 10000;
547 range->end = 10001;
548 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200549
Lukas Zilka21d8c982018-01-24 11:11:20 +0100550 {
551 options.supported_codepoint_ranges.emplace_back(
552 new FeatureProcessorOptions_::CodepointRangeT());
553 auto& range = options.supported_codepoint_ranges.back();
554 range->start = 20000;
555 range->end = 30000;
556 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200557
Lukas Zilka21d8c982018-01-24 11:11:20 +0100558 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100559 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100560 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100561 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
562 &unilib);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200563 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100564 {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200565 FloatEq(1.0));
566 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100567 {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200568 FloatEq(2.0 / 3));
569 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100570 {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200571 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200572 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
573 -1, feature_processor.supported_codepoint_ranges_));
574 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
575 0, feature_processor.supported_codepoint_ranges_));
576 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
577 10, feature_processor.supported_codepoint_ranges_));
578 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
579 127, feature_processor.supported_codepoint_ranges_));
580 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
581 128, feature_processor.supported_codepoint_ranges_));
582 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
583 9999, feature_processor.supported_codepoint_ranges_));
584 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
585 10000, feature_processor.supported_codepoint_ranges_));
586 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
587 10001, feature_processor.supported_codepoint_ranges_));
588 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
589 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200590
Lukas Zilka21d8c982018-01-24 11:11:20 +0100591 const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
592 Token("eee", 8, 11)};
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200593
Lukas Zilka21d8c982018-01-24 11:11:20 +0100594 options.min_supported_codepoint_ratio = 0.0;
595 flatbuffers::DetachedBuffer options2_fb =
596 PackFeatureProcessorOptions(options);
597 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100598 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
599 &unilib);
Lukas Zilka434442d2018-04-25 11:38:51 +0200600 EXPECT_TRUE(feature_processor2.HasEnoughSupportedCodepoints(
601 tokens, /*token_span=*/{0, 3}));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200602
Lukas Zilka21d8c982018-01-24 11:11:20 +0100603 options.min_supported_codepoint_ratio = 0.2;
604 flatbuffers::DetachedBuffer options3_fb =
605 PackFeatureProcessorOptions(options);
606 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100607 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
608 &unilib);
Lukas Zilka434442d2018-04-25 11:38:51 +0200609 EXPECT_TRUE(feature_processor3.HasEnoughSupportedCodepoints(
610 tokens, /*token_span=*/{0, 3}));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100611
612 options.min_supported_codepoint_ratio = 0.5;
613 flatbuffers::DetachedBuffer options4_fb =
614 PackFeatureProcessorOptions(options);
615 TestingFeatureProcessor feature_processor4(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100616 flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),
617 &unilib);
Lukas Zilka434442d2018-04-25 11:38:51 +0200618 EXPECT_FALSE(feature_processor4.HasEnoughSupportedCodepoints(
619 tokens, /*token_span=*/{0, 3}));
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200620}
621
Lukas Zilkab23e2122018-02-09 10:25:19 +0100622TEST(FeatureProcessorTest, InSpanFeature) {
623 FeatureProcessorOptionsT options;
624 options.context_size = 2;
625 options.max_selection_span = 2;
626 options.snap_label_span_boundaries_to_containing_tokens = false;
627 options.feature_version = 2;
628 options.embedding_size = 4;
629 options.extract_selection_mask_feature = true;
630
631 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100632 CREATE_UNILIB_FOR_TESTING;
Lukas Zilkab23e2122018-02-09 10:25:19 +0100633 TestingFeatureProcessor feature_processor(
634 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
635 &unilib);
636
637 std::unique_ptr<CachedFeatures> cached_features;
638
639 FakeEmbeddingExecutor embedding_executor;
640
641 const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
642 Token("ccc", 8, 11), Token("ddd", 12, 15)};
643
644 EXPECT_TRUE(feature_processor.ExtractFeatures(
645 tokens, /*token_span=*/{0, 4},
646 /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
Lukas Zilkaba849e72018-03-08 14:48:21 +0100647 /*embedding_cache=*/nullptr, /*feature_vector_size=*/5,
648 &cached_features));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100649 std::vector<float> features;
650 cached_features->AppendClickContextFeaturesForClick(1, &features);
651 ASSERT_EQ(features.size(), 25);
652 EXPECT_THAT(features[4], FloatEq(0.0));
653 EXPECT_THAT(features[9], FloatEq(0.0));
654 EXPECT_THAT(features[14], FloatEq(1.0));
655 EXPECT_THAT(features[19], FloatEq(1.0));
656 EXPECT_THAT(features[24], FloatEq(0.0));
657}
658
Lukas Zilkaba849e72018-03-08 14:48:21 +0100659TEST(FeatureProcessorTest, EmbeddingCache) {
660 FeatureProcessorOptionsT options;
661 options.context_size = 2;
662 options.max_selection_span = 2;
663 options.snap_label_span_boundaries_to_containing_tokens = false;
664 options.feature_version = 2;
665 options.embedding_size = 4;
666 options.bounds_sensitive_features.reset(
667 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
668 options.bounds_sensitive_features->enabled = true;
669 options.bounds_sensitive_features->num_tokens_before = 3;
670 options.bounds_sensitive_features->num_tokens_inside_left = 2;
671 options.bounds_sensitive_features->num_tokens_inside_right = 2;
672 options.bounds_sensitive_features->num_tokens_after = 3;
673
674 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
675 CREATE_UNILIB_FOR_TESTING;
676 TestingFeatureProcessor feature_processor(
677 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
678 &unilib);
679
680 std::unique_ptr<CachedFeatures> cached_features;
681
682 FakeEmbeddingExecutor embedding_executor;
683
684 const std::vector<Token> tokens = {
685 Token("aaa", 0, 3), Token("bbb", 4, 7), Token("ccc", 8, 11),
686 Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};
687
688 // We pre-populate the cache with dummy embeddings, to make sure they are
689 // used when populating the features vector.
690 const std::vector<float> cached_padding_features = {10.0, -10.0, 10.0, -10.0};
691 const std::vector<float> cached_features1 = {1.0, 2.0, 3.0, 4.0};
692 const std::vector<float> cached_features2 = {5.0, 6.0, 7.0, 8.0};
693 FeatureProcessor::EmbeddingCache embedding_cache = {
694 {{kInvalidIndex, kInvalidIndex}, cached_padding_features},
695 {{4, 7}, cached_features1},
696 {{12, 15}, cached_features2},
697 };
698
699 EXPECT_TRUE(feature_processor.ExtractFeatures(
700 tokens, /*token_span=*/{0, 6},
701 /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
702 &embedding_executor, &embedding_cache, /*feature_vector_size=*/4,
703 &cached_features));
704 std::vector<float> features;
705 cached_features->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
706 ASSERT_EQ(features.size(), 40);
707 // Check that the dummy embeddings were used.
708 EXPECT_THAT(Subvector(features, 0, 4),
709 ElementsAreFloat(cached_padding_features));
710 EXPECT_THAT(Subvector(features, 8, 12), ElementsAreFloat(cached_features1));
711 EXPECT_THAT(Subvector(features, 16, 20), ElementsAreFloat(cached_features2));
712 EXPECT_THAT(Subvector(features, 24, 28), ElementsAreFloat(cached_features2));
713 EXPECT_THAT(Subvector(features, 36, 40),
714 ElementsAreFloat(cached_padding_features));
715 // Check that the real embeddings were cached.
716 EXPECT_EQ(embedding_cache.size(), 7);
717 EXPECT_THAT(Subvector(features, 4, 8),
718 ElementsAreFloat(embedding_cache.at({0, 3})));
719 EXPECT_THAT(Subvector(features, 12, 16),
720 ElementsAreFloat(embedding_cache.at({8, 11})));
721 EXPECT_THAT(Subvector(features, 20, 24),
722 ElementsAreFloat(embedding_cache.at({8, 11})));
723 EXPECT_THAT(Subvector(features, 28, 32),
724 ElementsAreFloat(embedding_cache.at({16, 19})));
725 EXPECT_THAT(Subvector(features, 32, 36),
726 ElementsAreFloat(embedding_cache.at({20, 23})));
727}
728
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200729TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
730 std::vector<Token> tokens_orig{
731 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
732 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
733 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
734 Token("12", 0, 0)};
735
736 std::vector<Token> tokens;
737 int click_index;
738
739 // Try to click first token and see if it gets padded from left.
740 tokens = tokens_orig;
741 click_index = 0;
742 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
743 // clang-format off
744 EXPECT_EQ(tokens, std::vector<Token>({Token(),
745 Token(),
746 Token("0", 0, 0),
747 Token("1", 0, 0),
748 Token("2", 0, 0)}));
749 // clang-format on
750 EXPECT_EQ(click_index, 2);
751
752 // When we click the second token nothing should get padded.
753 tokens = tokens_orig;
754 click_index = 2;
755 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
756 // clang-format off
757 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
758 Token("1", 0, 0),
759 Token("2", 0, 0),
760 Token("3", 0, 0),
761 Token("4", 0, 0)}));
762 // clang-format on
763 EXPECT_EQ(click_index, 2);
764
765 // When we click the last token tokens should get padded from the right.
766 tokens = tokens_orig;
767 click_index = 12;
768 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
769 // clang-format off
770 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
771 Token("11", 0, 0),
772 Token("12", 0, 0),
773 Token(),
774 Token()}));
775 // clang-format on
776 EXPECT_EQ(click_index, 2);
777}
778
779TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
780 std::vector<Token> tokens_orig{
781 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
782 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
783 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
784 Token("12", 0, 0)};
785
786 std::vector<Token> tokens;
787 int click_index;
788
789 // Try to click first token and see if it gets padded from left to maximum
790 // context_size.
791 tokens = tokens_orig;
792 click_index = 0;
793 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
794 // clang-format off
795 EXPECT_EQ(tokens, std::vector<Token>({Token(),
796 Token(),
797 Token("0", 0, 0),
798 Token("1", 0, 0),
799 Token("2", 0, 0),
800 Token("3", 0, 0),
801 Token("4", 0, 0),
802 Token("5", 0, 0)}));
803 // clang-format on
804 EXPECT_EQ(click_index, 2);
805
806 // Clicking to the middle with enough context should not produce any padding.
807 tokens = tokens_orig;
808 click_index = 6;
809 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
810 // clang-format off
811 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
812 Token("2", 0, 0),
813 Token("3", 0, 0),
814 Token("4", 0, 0),
815 Token("5", 0, 0),
816 Token("6", 0, 0),
817 Token("7", 0, 0),
818 Token("8", 0, 0),
819 Token("9", 0, 0)}));
820 // clang-format on
821 EXPECT_EQ(click_index, 5);
822
823 // Clicking at the end should pad right to maximum context_size.
824 tokens = tokens_orig;
825 click_index = 11;
826 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
827 // clang-format off
828 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
829 Token("7", 0, 0),
830 Token("8", 0, 0),
831 Token("9", 0, 0),
832 Token("10", 0, 0),
833 Token("11", 0, 0),
834 Token("12", 0, 0),
835 Token(),
836 Token()}));
837 // clang-format on
838 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200839}
840
Lukas Zilka21d8c982018-01-24 11:11:20 +0100841TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100842 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100843 FeatureProcessorOptionsT options;
844 options.tokenization_codepoint_config.emplace_back(
845 new TokenizationCodepointRangeT());
846 {
847 auto& config = options.tokenization_codepoint_config.back();
848 config->start = 0;
849 config->end = 256;
850 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
851 config->script_id = 1;
852 }
853 options.tokenize_on_script_change = false;
Lukas Zilka40c18de2017-04-10 17:22:22 +0200854
Lukas Zilka21d8c982018-01-24 11:11:20 +0100855 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
856 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100857 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
858 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100859
860 EXPECT_EQ(feature_processor.Tokenize("앨라배마123웹사이트"),
861 std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
862
863 options.tokenize_on_script_change = true;
864 flatbuffers::DetachedBuffer options_fb2 =
865 PackFeatureProcessorOptions(options);
866 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100867 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb2.data()),
868 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100869
870 EXPECT_EQ(feature_processor2.Tokenize("앨라배마123웹사이트"),
871 std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
872 Token("웹사이트", 7, 11)}));
873}
874
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks that ICU tokenization splits unspaced Thai text into the expected
// five tokens (token offsets are codepoint indices).
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, the same two-argument
  // form used by the unconditionally compiled tests in this file.
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens = feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200894
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks that, with icu_preserve_whitespace_tokens enabled, ICU tokenization
// emits every separating space as its own token between the Thai words.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, the same two-argument
  // form used by the unconditionally compiled tests in this file.
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200920
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks MIXED tokenization on text that combines Japanese, accented Latin
// words and a URL. Presumably codepoints inside the configured internal
// tokenizer ranges are handled by the internal tokenizer and everything else
// by ICU -- confirm against the FeatureProcessor implementation.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // The space character (codepoint 32) is a whitespace separator.
  {
    options.tokenization_codepoint_config.emplace_back(
        new TokenizationCodepointRangeT());
    auto& config = options.tokenization_codepoint_config.back();
    config->start = 32;
    config->end = 33;
    config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  }

  // Appends one codepoint range to the internal tokenizer's range list.
  const auto add_internal_tokenizer_range = [&options](int start, int end) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.internal_tokenizer_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  // Four adjacent ranges that together span codepoints 0 through 592.
  add_internal_tokenizer_range(0, 128);
  add_internal_tokenizer_range(128, 256);
  add_internal_tokenizer_range(256, 384);
  add_internal_tokenizer_range(384, 592);

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, the same two-argument
  // form used by the unconditionally compiled tests in this file.
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens,
            // clang-format off
            std::vector<Token>({Token("こんにちは", 0, 5),
                                Token("Japanese-ląnguagę", 5, 22),
                                Token("text", 23, 27),
                                Token("世界", 28, 30),
                                Token("http://www.google.com/", 31, 53)}));
  // clang-format on
}
#endif
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200980
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200981TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100982 CREATE_UNILIB_FOR_TESTING;
Lukas Zilka21d8c982018-01-24 11:11:20 +0100983 FeatureProcessorOptionsT options;
984 options.ignored_span_boundary_codepoints.push_back('.');
985 options.ignored_span_boundary_codepoints.push_back(',');
986 options.ignored_span_boundary_codepoints.push_back('[');
987 options.ignored_span_boundary_codepoints.push_back(']');
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200988
Lukas Zilka21d8c982018-01-24 11:11:20 +0100989 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
990 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100991 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
992 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200993
994 const std::string text1_utf8 = "ěščř";
995 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
996 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
997 text1.begin(), text1.end(),
998 /*count_from_beginning=*/true),
999 0);
1000 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1001 text1.begin(), text1.end(),
1002 /*count_from_beginning=*/false),
1003 0);
1004
1005 const std::string text2_utf8 = ".,abčd";
1006 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
1007 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1008 text2.begin(), text2.end(),
1009 /*count_from_beginning=*/true),
1010 2);
1011 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1012 text2.begin(), text2.end(),
1013 /*count_from_beginning=*/false),
1014 0);
1015
1016 const std::string text3_utf8 = ".,abčd[]";
1017 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
1018 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1019 text3.begin(), text3.end(),
1020 /*count_from_beginning=*/true),
1021 2);
1022 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1023 text3.begin(), text3.end(),
1024 /*count_from_beginning=*/false),
1025 2);
1026
1027 const std::string text4_utf8 = "[abčd]";
1028 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
1029 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1030 text4.begin(), text4.end(),
1031 /*count_from_beginning=*/true),
1032 1);
1033 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1034 text4.begin(), text4.end(),
1035 /*count_from_beginning=*/false),
1036 1);
1037
1038 const std::string text5_utf8 = "";
1039 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
1040 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1041 text5.begin(), text5.end(),
1042 /*count_from_beginning=*/true),
1043 0);
1044 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1045 text5.begin(), text5.end(),
1046 /*count_from_beginning=*/false),
1047 0);
1048
1049 const std::string text6_utf8 = "012345ěščř";
1050 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
1051 UnicodeText::const_iterator text6_begin = text6.begin();
1052 std::advance(text6_begin, 6);
1053 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1054 text6_begin, text6.end(),
1055 /*count_from_beginning=*/true),
1056 0);
1057 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1058 text6_begin, text6.end(),
1059 /*count_from_beginning=*/false),
1060 0);
1061
1062 const std::string text7_utf8 = "012345.,ěščř";
1063 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
1064 UnicodeText::const_iterator text7_begin = text7.begin();
1065 std::advance(text7_begin, 6);
1066 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1067 text7_begin, text7.end(),
1068 /*count_from_beginning=*/true),
1069 2);
1070 UnicodeText::const_iterator text7_end = text7.begin();
1071 std::advance(text7_end, 8);
1072 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1073 text7.begin(), text7_end,
1074 /*count_from_beginning=*/false),
1075 2);
1076
1077 // Test not stripping.
1078 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1079 "Hello [[[Wořld]] or not?", {0, 24}),
1080 std::make_pair(0, 24));
1081 // Test basic stripping.
1082 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1083 "Hello [[[Wořld]] or not?", {6, 16}),
1084 std::make_pair(9, 14));
1085 // Test stripping when everything is stripped.
1086 EXPECT_EQ(
1087 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
1088 std::make_pair(6, 6));
1089 // Test stripping empty string.
1090 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
1091 std::make_pair(0, 0));
1092}
1093
Lukas Zilka726b4d22017-12-13 16:37:03 +01001094TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
1095 const std::vector<Token> tokens{Token("Hělló", 0, 5),
1096 Token("fěěbař@google.com", 6, 23),
1097 Token("heře!", 24, 29)};
1098
1099 // Spans matching the tokens exactly.
1100 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
1101 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
1102 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
1103 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
1104 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
1105 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
1106
1107 // Snapping to containing tokens has no effect.
1108 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
1109 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
1110 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
1111 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
1112 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
1113 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
1114
1115 // Span boundaries inside tokens.
1116 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
1117 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
1118
1119 // Tokens adjacent to the span, but not overlapping.
1120 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
1121 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
1122}
1123
Matt Sharifid40f9762017-03-14 21:24:23 +01001124} // namespace
Lukas Zilka21d8c982018-01-24 11:11:20 +01001125} // namespace libtextclassifier2