blob: 78977d447c48150323863e5d28e630d140bf9142 [file] [log] [blame]
Matt Sharifid40f9762017-03-14 21:24:23 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Lukas Zilka21d8c982018-01-24 11:11:20 +010017#include "feature-processor.h"
18
19#include "model-executor.h"
20#include "tensor-view.h"
Matt Sharifid40f9762017-03-14 21:24:23 +010021
22#include "gmock/gmock.h"
23#include "gtest/gtest.h"
24
Lukas Zilka21d8c982018-01-24 11:11:20 +010025namespace libtextclassifier2 {
Matt Sharifid40f9762017-03-14 21:24:23 +010026namespace {
27
28using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020029using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010030
Lukas Zilka21d8c982018-01-24 11:11:20 +010031flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
32 const FeatureProcessorOptionsT& options) {
33 flatbuffers::FlatBufferBuilder builder;
34 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
35 return builder.Release();
36}
37
Lukas Zilka726b4d22017-12-13 16:37:03 +010038class TestingFeatureProcessor : public FeatureProcessor {
39 public:
40 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
41 using FeatureProcessor::FeatureProcessor;
42 using FeatureProcessor::ICUTokenize;
43 using FeatureProcessor::IsCodepointInRanges;
44 using FeatureProcessor::SpanToLabel;
45 using FeatureProcessor::StripTokensFromOtherLines;
46 using FeatureProcessor::supported_codepoint_ranges_;
47 using FeatureProcessor::SupportedCodepointsRatio;
48};
49
Lukas Zilka21d8c982018-01-24 11:11:20 +010050// EmbeddingExecutor that always returns features based on
51class FakeEmbeddingExecutor : public EmbeddingExecutor {
52 public:
53 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
54 int dest_size) override {
55 TC_CHECK_GE(dest_size, 4);
56 EXPECT_EQ(sparse_features.size(), 1);
57 dest[0] = sparse_features.data()[0];
58 dest[1] = sparse_features.data()[0];
59 dest[2] = -sparse_features.data()[0];
60 dest[3] = -sparse_features.data()[0];
61 return true;
62 }
63
64 private:
65 std::vector<float> storage_;
66};
67
Matt Sharifid40f9762017-03-14 21:24:23 +010068TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020069 std::vector<Token> tokens{Token("Hělló", 0, 5),
70 Token("fěěbař@google.com", 6, 23),
71 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010072
73 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
74
75 // clang-format off
76 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020077 {Token("Hělló", 0, 5),
78 Token("fěě", 6, 9),
79 Token("bař", 9, 12),
80 Token("@google.com", 12, 23),
81 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010082 // clang-format on
83}
84
85TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020086 std::vector<Token> tokens{Token("Hělló", 0, 5),
87 Token("fěěbař@google.com", 6, 23),
88 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010089
90 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
91
92 // clang-format off
93 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020094 {Token("Hělló", 0, 5),
95 Token("fěěbař", 6, 12),
96 Token("@google.com", 12, 23),
97 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010098 // clang-format on
99}
100
101TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200102 std::vector<Token> tokens{Token("Hělló", 0, 5),
103 Token("fěěbař@google.com", 6, 23),
104 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100105
106 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
107
108 // clang-format off
109 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200110 {Token("Hělló", 0, 5),
111 Token("fěě", 6, 9),
112 Token("bař@google.com", 9, 23),
113 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100114 // clang-format on
115}
116
117TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200118 std::vector<Token> tokens{Token("Hělló", 0, 5),
119 Token("fěěbař@google.com", 6, 23),
120 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100121
122 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
123
124 // clang-format off
125 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200126 {Token("Hělló", 0, 5),
127 Token("fěěbař@google.com", 6, 23),
128 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100129 // clang-format on
130}
131
132TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200133 std::vector<Token> tokens{Token("Hělló", 0, 5),
134 Token("fěěbař@google.com", 6, 23),
135 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100136
137 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
138
139 // clang-format off
140 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200141 {Token("Hě", 0, 2),
142 Token("lló", 2, 5),
143 Token("fěě", 6, 9),
144 Token("bař@google.com", 9, 23),
145 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100146 // clang-format on
147}
148
149TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100150 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100151 FeatureProcessorOptionsT options;
152 options.only_use_line_with_click = true;
153 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
154 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100155 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
156 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100157
Matt Sharifibe876dc2017-03-17 17:02:43 +0100158 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
159 const CodepointSpan span = {0, 5};
160 // clang-format off
161 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
162 Token("Lině", 6, 10),
163 Token("Sěcond", 11, 17),
164 Token("Lině", 18, 22),
165 Token("Thiřd", 23, 28),
166 Token("Lině", 29, 33)};
167 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100168
169 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100170 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100171 EXPECT_THAT(tokens,
172 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100173}
174
175TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100176 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100177 FeatureProcessorOptionsT options;
178 options.only_use_line_with_click = true;
179 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
180 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100181 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
182 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100183
Matt Sharifibe876dc2017-03-17 17:02:43 +0100184 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
185 const CodepointSpan span = {18, 22};
186 // clang-format off
187 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
188 Token("Lině", 6, 10),
189 Token("Sěcond", 11, 17),
190 Token("Lině", 18, 22),
191 Token("Thiřd", 23, 28),
192 Token("Lině", 29, 33)};
193 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100194
Matt Sharifibe876dc2017-03-17 17:02:43 +0100195 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100196 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100197 EXPECT_THAT(tokens, ElementsAreArray(
198 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100199}
200
201TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100202 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100203 FeatureProcessorOptionsT options;
204 options.only_use_line_with_click = true;
205 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
206 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100207 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
208 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100209
Matt Sharifibe876dc2017-03-17 17:02:43 +0100210 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
211 const CodepointSpan span = {24, 33};
212 // clang-format off
213 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
214 Token("Lině", 6, 10),
215 Token("Sěcond", 11, 17),
216 Token("Lině", 18, 22),
217 Token("Thiřd", 23, 28),
218 Token("Lině", 29, 33)};
219 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100220
Matt Sharifibe876dc2017-03-17 17:02:43 +0100221 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100222 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100223 EXPECT_THAT(tokens, ElementsAreArray(
224 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100225}
226
227TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100228 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100229 FeatureProcessorOptionsT options;
230 options.only_use_line_with_click = true;
231 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
232 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100233 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
234 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100235
Matt Sharifibe876dc2017-03-17 17:02:43 +0100236 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
237 const CodepointSpan span = {18, 22};
238 // clang-format off
239 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
240 Token("Lině", 6, 10),
241 Token("Sěcond", 11, 17),
242 Token("Lině", 18, 22),
243 Token("Thiřd", 23, 28),
244 Token("Lině", 29, 33)};
245 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100246
Matt Sharifibe876dc2017-03-17 17:02:43 +0100247 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100248 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100249 EXPECT_THAT(tokens, ElementsAreArray(
250 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100251}
252
253TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100254 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100255 FeatureProcessorOptionsT options;
256 options.only_use_line_with_click = true;
257 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
258 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100259 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
260 &unilib);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100261
Matt Sharifibe876dc2017-03-17 17:02:43 +0100262 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
263 const CodepointSpan span = {5, 23};
264 // clang-format off
265 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
266 Token("Lině", 6, 10),
267 Token("Sěcond", 18, 23),
268 Token("Lině", 19, 23),
269 Token("Thiřd", 23, 28),
270 Token("Lině", 29, 33)};
271 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100272
Matt Sharifibe876dc2017-03-17 17:02:43 +0100273 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100274 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100275 EXPECT_THAT(tokens, ElementsAreArray(
276 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
277 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
278 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100279}
280
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200281TEST(FeatureProcessorTest, SpanToLabel) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100282 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100283 FeatureProcessorOptionsT options;
284 options.context_size = 1;
285 options.max_selection_span = 1;
286 options.snap_label_span_boundaries_to_containing_tokens = false;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200287
Lukas Zilka21d8c982018-01-24 11:11:20 +0100288 options.tokenization_codepoint_config.emplace_back(
289 new TokenizationCodepointRangeT());
290 auto& config = options.tokenization_codepoint_config.back();
291 config->start = 32;
292 config->end = 33;
293 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200294
Lukas Zilka21d8c982018-01-24 11:11:20 +0100295 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
296 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100297 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
298 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200299 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
300 ASSERT_EQ(3, tokens.size());
301 int label;
302 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
303 EXPECT_EQ(kInvalidLabel, label);
304 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
305 EXPECT_NE(kInvalidLabel, label);
306 TokenSpan token_span;
307 feature_processor.LabelToTokenSpan(label, &token_span);
308 EXPECT_EQ(0, token_span.first);
309 EXPECT_EQ(0, token_span.second);
310
311 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100312 options.snap_label_span_boundaries_to_containing_tokens = true;
313 flatbuffers::DetachedBuffer options2_fb =
314 PackFeatureProcessorOptions(options);
315 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100316 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
317 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200318 int label2;
319 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
320 EXPECT_EQ(label, label2);
321 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
322 EXPECT_EQ(label, label2);
323 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
324 EXPECT_EQ(label, label2);
325
326 // Cross a token boundary.
327 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
328 EXPECT_EQ(kInvalidLabel, label2);
329 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
330 EXPECT_EQ(kInvalidLabel, label2);
331
332 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100333 options.context_size = 2;
334 options.max_selection_span = 2;
335 flatbuffers::DetachedBuffer options3_fb =
336 PackFeatureProcessorOptions(options);
337 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100338 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
339 &unilib);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200340 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
341 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
342 EXPECT_NE(kInvalidLabel, label2);
343 feature_processor3.LabelToTokenSpan(label2, &token_span);
344 EXPECT_EQ(1, token_span.first);
345 EXPECT_EQ(0, token_span.second);
346
347 int label3;
348 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
349 EXPECT_EQ(label2, label3);
350 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
351 EXPECT_EQ(label2, label3);
352 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
353 EXPECT_EQ(label2, label3);
354}
355
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200356TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100357 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100358 FeatureProcessorOptionsT options;
359 options.context_size = 1;
360 options.max_selection_span = 1;
361 options.snap_label_span_boundaries_to_containing_tokens = false;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200362
Lukas Zilka21d8c982018-01-24 11:11:20 +0100363 options.tokenization_codepoint_config.emplace_back(
364 new TokenizationCodepointRangeT());
365 auto& config = options.tokenization_codepoint_config.back();
366 config->start = 32;
367 config->end = 33;
368 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200369
Lukas Zilka21d8c982018-01-24 11:11:20 +0100370 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
371 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100372 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
373 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200374 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
375 ASSERT_EQ(3, tokens.size());
376 int label;
377 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
378 EXPECT_EQ(kInvalidLabel, label);
379 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
380 EXPECT_NE(kInvalidLabel, label);
381 TokenSpan token_span;
382 feature_processor.LabelToTokenSpan(label, &token_span);
383 EXPECT_EQ(0, token_span.first);
384 EXPECT_EQ(0, token_span.second);
385
386 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100387 options.snap_label_span_boundaries_to_containing_tokens = true;
388 flatbuffers::DetachedBuffer options2_fb =
389 PackFeatureProcessorOptions(options);
390 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100391 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
392 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200393 int label2;
394 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
395 EXPECT_EQ(label, label2);
396 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
397 EXPECT_EQ(label, label2);
398 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
399 EXPECT_EQ(label, label2);
400
401 // Cross a token boundary.
402 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
403 EXPECT_EQ(kInvalidLabel, label2);
404 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
405 EXPECT_EQ(kInvalidLabel, label2);
406
407 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100408 options.context_size = 2;
409 options.max_selection_span = 2;
410 flatbuffers::DetachedBuffer options3_fb =
411 PackFeatureProcessorOptions(options);
412 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100413 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
414 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200415 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
416 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
417 EXPECT_NE(kInvalidLabel, label2);
418 feature_processor3.LabelToTokenSpan(label2, &token_span);
419 EXPECT_EQ(1, token_span.first);
420 EXPECT_EQ(0, token_span.second);
421
422 int label3;
423 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
424 EXPECT_EQ(label2, label3);
425 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
426 EXPECT_EQ(label2, label3);
427 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
428 EXPECT_EQ(label2, label3);
429}
430
Matt Sharifibe876dc2017-03-17 17:02:43 +0100431TEST(FeatureProcessorTest, CenterTokenFromClick) {
432 int token_index;
433
434 // Exactly aligned indices.
435 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200436 {6, 11},
437 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100438 EXPECT_EQ(token_index, 1);
439
440 // Click is contained in a token.
441 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200442 {13, 17},
443 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100444 EXPECT_EQ(token_index, 2);
445
446 // Click spans two tokens.
447 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200448 {6, 17},
449 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100450 EXPECT_EQ(token_index, kInvalidIndex);
451}
452
453TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100454 int token_index;
455
456 // Selection of length 3. Exactly aligned indices.
457 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200458 {7, 27},
459 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
460 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100461 EXPECT_EQ(token_index, 2);
462
463 // Selection of length 1 token. Exactly aligned indices.
464 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200465 {21, 27},
466 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
467 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100468 EXPECT_EQ(token_index, 3);
469
470 // Selection marks sub-token range, with no tokens in it.
471 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200472 {29, 33},
473 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
474 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100475 EXPECT_EQ(token_index, kInvalidIndex);
476
477 // Selection of length 2. Sub-token indices.
478 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200479 {3, 25},
480 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
481 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100482 EXPECT_EQ(token_index, 1);
483
484 // Selection of length 1. Sub-token indices.
485 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200486 {22, 34},
487 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
488 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100489 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400490
491 // Some invalid ones.
492 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
493 EXPECT_EQ(token_index, -1);
494}
495
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200496TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100497 FeatureProcessorOptionsT options;
498 options.context_size = 2;
499 options.max_selection_span = 2;
500 options.snap_label_span_boundaries_to_containing_tokens = false;
501 options.feature_version = 2;
502 options.embedding_size = 4;
503 options.bounds_sensitive_features.reset(
504 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
505 options.bounds_sensitive_features->enabled = true;
506 options.bounds_sensitive_features->num_tokens_before = 5;
507 options.bounds_sensitive_features->num_tokens_inside_left = 3;
508 options.bounds_sensitive_features->num_tokens_inside_right = 3;
509 options.bounds_sensitive_features->num_tokens_after = 5;
510 options.bounds_sensitive_features->include_inside_bag = true;
511 options.bounds_sensitive_features->include_inside_length = true;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200512
Lukas Zilka21d8c982018-01-24 11:11:20 +0100513 options.tokenization_codepoint_config.emplace_back(
514 new TokenizationCodepointRangeT());
515 auto& config = options.tokenization_codepoint_config.back();
516 config->start = 32;
517 config->end = 33;
518 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200519
Lukas Zilka21d8c982018-01-24 11:11:20 +0100520 {
521 options.supported_codepoint_ranges.emplace_back(
522 new FeatureProcessorOptions_::CodepointRangeT());
523 auto& range = options.supported_codepoint_ranges.back();
524 range->start = 0;
525 range->end = 128;
526 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200527
Lukas Zilka21d8c982018-01-24 11:11:20 +0100528 {
529 options.supported_codepoint_ranges.emplace_back(
530 new FeatureProcessorOptions_::CodepointRangeT());
531 auto& range = options.supported_codepoint_ranges.back();
532 range->start = 10000;
533 range->end = 10001;
534 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200535
Lukas Zilka21d8c982018-01-24 11:11:20 +0100536 {
537 options.supported_codepoint_ranges.emplace_back(
538 new FeatureProcessorOptions_::CodepointRangeT());
539 auto& range = options.supported_codepoint_ranges.back();
540 range->start = 20000;
541 range->end = 30000;
542 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200543
Lukas Zilka21d8c982018-01-24 11:11:20 +0100544 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100545 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100546 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100547 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
548 &unilib);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200549 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100550 {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200551 FloatEq(1.0));
552 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100553 {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200554 FloatEq(2.0 / 3));
555 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100556 {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200557 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200558 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
559 -1, feature_processor.supported_codepoint_ranges_));
560 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
561 0, feature_processor.supported_codepoint_ranges_));
562 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
563 10, feature_processor.supported_codepoint_ranges_));
564 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
565 127, feature_processor.supported_codepoint_ranges_));
566 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
567 128, feature_processor.supported_codepoint_ranges_));
568 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
569 9999, feature_processor.supported_codepoint_ranges_));
570 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
571 10000, feature_processor.supported_codepoint_ranges_));
572 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
573 10001, feature_processor.supported_codepoint_ranges_));
574 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
575 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200576
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200577 std::unique_ptr<CachedFeatures> cached_features;
578
Lukas Zilka21d8c982018-01-24 11:11:20 +0100579 FakeEmbeddingExecutor embedding_executor;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200580
Lukas Zilka21d8c982018-01-24 11:11:20 +0100581 const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
582 Token("eee", 8, 11)};
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200583
Lukas Zilka21d8c982018-01-24 11:11:20 +0100584 options.min_supported_codepoint_ratio = 0.0;
585 flatbuffers::DetachedBuffer options2_fb =
586 PackFeatureProcessorOptions(options);
587 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100588 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
589 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100590 EXPECT_TRUE(feature_processor2.ExtractFeatures(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100591 tokens, /*token_span=*/{0, 3},
592 /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
593 &embedding_executor,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100594 /*feature_vector_size=*/4, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200595
Lukas Zilka21d8c982018-01-24 11:11:20 +0100596 options.min_supported_codepoint_ratio = 0.2;
597 flatbuffers::DetachedBuffer options3_fb =
598 PackFeatureProcessorOptions(options);
599 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100600 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
601 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100602 EXPECT_TRUE(feature_processor3.ExtractFeatures(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100603 tokens, /*token_span=*/{0, 3},
604 /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
605 &embedding_executor,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100606 /*feature_vector_size=*/4, &cached_features));
607
608 options.min_supported_codepoint_ratio = 0.5;
609 flatbuffers::DetachedBuffer options4_fb =
610 PackFeatureProcessorOptions(options);
611 TestingFeatureProcessor feature_processor4(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100612 flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),
613 &unilib);
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200614 EXPECT_FALSE(feature_processor4.ExtractFeatures(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100615 tokens, /*token_span=*/{0, 3},
616 /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
617 &embedding_executor,
Lukas Zilka21d8c982018-01-24 11:11:20 +0100618 /*feature_vector_size=*/4, &cached_features));
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200619}
620
Lukas Zilkab23e2122018-02-09 10:25:19 +0100621TEST(FeatureProcessorTest, InSpanFeature) {
622 FeatureProcessorOptionsT options;
623 options.context_size = 2;
624 options.max_selection_span = 2;
625 options.snap_label_span_boundaries_to_containing_tokens = false;
626 options.feature_version = 2;
627 options.embedding_size = 4;
628 options.extract_selection_mask_feature = true;
629
630 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
631 CREATE_UNILIB_FOR_TESTING
632 TestingFeatureProcessor feature_processor(
633 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
634 &unilib);
635
636 std::unique_ptr<CachedFeatures> cached_features;
637
638 FakeEmbeddingExecutor embedding_executor;
639
640 const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
641 Token("ccc", 8, 11), Token("ddd", 12, 15)};
642
643 EXPECT_TRUE(feature_processor.ExtractFeatures(
644 tokens, /*token_span=*/{0, 4},
645 /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
646 /*feature_vector_size=*/5, &cached_features));
647 std::vector<float> features;
648 cached_features->AppendClickContextFeaturesForClick(1, &features);
649 ASSERT_EQ(features.size(), 25);
650 EXPECT_THAT(features[4], FloatEq(0.0));
651 EXPECT_THAT(features[9], FloatEq(0.0));
652 EXPECT_THAT(features[14], FloatEq(1.0));
653 EXPECT_THAT(features[19], FloatEq(1.0));
654 EXPECT_THAT(features[24], FloatEq(0.0));
655}
656
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200657TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
658 std::vector<Token> tokens_orig{
659 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
660 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
661 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
662 Token("12", 0, 0)};
663
664 std::vector<Token> tokens;
665 int click_index;
666
667 // Try to click first token and see if it gets padded from left.
668 tokens = tokens_orig;
669 click_index = 0;
670 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
671 // clang-format off
672 EXPECT_EQ(tokens, std::vector<Token>({Token(),
673 Token(),
674 Token("0", 0, 0),
675 Token("1", 0, 0),
676 Token("2", 0, 0)}));
677 // clang-format on
678 EXPECT_EQ(click_index, 2);
679
680 // When we click the second token nothing should get padded.
681 tokens = tokens_orig;
682 click_index = 2;
683 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
684 // clang-format off
685 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
686 Token("1", 0, 0),
687 Token("2", 0, 0),
688 Token("3", 0, 0),
689 Token("4", 0, 0)}));
690 // clang-format on
691 EXPECT_EQ(click_index, 2);
692
693 // When we click the last token tokens should get padded from the right.
694 tokens = tokens_orig;
695 click_index = 12;
696 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
697 // clang-format off
698 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
699 Token("11", 0, 0),
700 Token("12", 0, 0),
701 Token(),
702 Token()}));
703 // clang-format on
704 EXPECT_EQ(click_index, 2);
705}
706
707TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
708 std::vector<Token> tokens_orig{
709 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
710 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
711 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
712 Token("12", 0, 0)};
713
714 std::vector<Token> tokens;
715 int click_index;
716
717 // Try to click first token and see if it gets padded from left to maximum
718 // context_size.
719 tokens = tokens_orig;
720 click_index = 0;
721 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
722 // clang-format off
723 EXPECT_EQ(tokens, std::vector<Token>({Token(),
724 Token(),
725 Token("0", 0, 0),
726 Token("1", 0, 0),
727 Token("2", 0, 0),
728 Token("3", 0, 0),
729 Token("4", 0, 0),
730 Token("5", 0, 0)}));
731 // clang-format on
732 EXPECT_EQ(click_index, 2);
733
734 // Clicking to the middle with enough context should not produce any padding.
735 tokens = tokens_orig;
736 click_index = 6;
737 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
738 // clang-format off
739 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
740 Token("2", 0, 0),
741 Token("3", 0, 0),
742 Token("4", 0, 0),
743 Token("5", 0, 0),
744 Token("6", 0, 0),
745 Token("7", 0, 0),
746 Token("8", 0, 0),
747 Token("9", 0, 0)}));
748 // clang-format on
749 EXPECT_EQ(click_index, 5);
750
751 // Clicking at the end should pad right to maximum context_size.
752 tokens = tokens_orig;
753 click_index = 11;
754 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
755 // clang-format off
756 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
757 Token("7", 0, 0),
758 Token("8", 0, 0),
759 Token("9", 0, 0),
760 Token("10", 0, 0),
761 Token("11", 0, 0),
762 Token("12", 0, 0),
763 Token(),
764 Token()}));
765 // clang-format on
766 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200767}
768
Lukas Zilka21d8c982018-01-24 11:11:20 +0100769TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100770 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100771 FeatureProcessorOptionsT options;
772 options.tokenization_codepoint_config.emplace_back(
773 new TokenizationCodepointRangeT());
774 {
775 auto& config = options.tokenization_codepoint_config.back();
776 config->start = 0;
777 config->end = 256;
778 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
779 config->script_id = 1;
780 }
781 options.tokenize_on_script_change = false;
Lukas Zilka40c18de2017-04-10 17:22:22 +0200782
Lukas Zilka21d8c982018-01-24 11:11:20 +0100783 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
784 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100785 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
786 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100787
788 EXPECT_EQ(feature_processor.Tokenize("앨라배마123웹사이트"),
789 std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
790
791 options.tokenize_on_script_change = true;
792 flatbuffers::DetachedBuffer options_fb2 =
793 PackFeatureProcessorOptions(options);
794 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100795 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb2.data()),
796 &unilib);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100797
798 EXPECT_EQ(feature_processor2.Tokenize("앨라배마123웹사이트"),
799 std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
800 Token("웹사이트", 7, 11)}));
801}
802
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks ICU-based tokenization of an unspaced Thai string: the input is
// expected to be split into five tokens with contiguous codepoint spans.
// Only built when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, matching the other tests
  // in this file.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");

  // clang-format off
  const std::vector<Token> expected_tokens({Token("พระบาท", 0, 6),
                                            Token("สมเด็จ", 6, 12),
                                            Token("พระ", 12, 15),
                                            Token("ปร", 15, 17),
                                            Token("มิ", 17, 19)});
  // clang-format on
  ASSERT_EQ(tokens, expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200822
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks ICU-based tokenization with icu_preserve_whitespace_tokens enabled:
// each single space between the Thai words is expected to be returned as its
// own token. Only built when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, matching the other tests
  // in this file.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");

  // clang-format off
  const std::vector<Token> expected_tokens({Token("พระบาท", 0, 6),
                                            Token(" ", 6, 7),
                                            Token("สมเด็จ", 7, 13),
                                            Token(" ", 13, 14),
                                            Token("พระ", 14, 17),
                                            Token(" ", 17, 18),
                                            Token("ปร", 18, 20),
                                            Token(" ", 20, 21),
                                            Token("มิ", 21, 23)});
  // clang-format on
  ASSERT_EQ(tokens, expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200848
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks TokenizationType_MIXED on text that combines Japanese, accented
// Latin, and a URL. Codepoints in [0, 592) are registered as internal
// tokenizer ranges and the space character is registered as a whitespace
// separator; the expected tokens below skip the spaces. Only built when
// LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint range [32, 33) -- the space character -- is a whitespace
  // separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& whitespace_config = *options.tokenization_codepoint_config.back();
  whitespace_config.start = 32;
  whitespace_config.end = 33;
  whitespace_config.role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Appends one [start, end) entry to internal_tokenizer_codepoint_ranges.
  const auto add_internal_tokenizer_range = [&options](int start, int end) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = *options.internal_tokenizer_codepoint_ranges.back();
    range.start = start;
    range.end = end;
  };
  add_internal_tokenizer_range(0, 128);
  add_internal_tokenizer_range(128, 256);
  add_internal_tokenizer_range(256, 384);
  add_internal_tokenizer_range(384, 592);

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Construct the processor with an explicit UniLib, matching the other tests
  // in this file.
  CREATE_UNILIB_FOR_TESTING
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  const std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");

  // clang-format off
  const std::vector<Token> expected_tokens(
      {Token("こんにちは", 0, 5),
       Token("Japanese-ląnguagę", 5, 22),
       Token("text", 23, 27),
       Token("世界", 28, 30),
       Token("http://www.google.com/", 31, 53)});
  // clang-format on
  ASSERT_EQ(tokens, expected_tokens);
}
#endif
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200908
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200909TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100910 CREATE_UNILIB_FOR_TESTING
Lukas Zilka21d8c982018-01-24 11:11:20 +0100911 FeatureProcessorOptionsT options;
912 options.ignored_span_boundary_codepoints.push_back('.');
913 options.ignored_span_boundary_codepoints.push_back(',');
914 options.ignored_span_boundary_codepoints.push_back('[');
915 options.ignored_span_boundary_codepoints.push_back(']');
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200916
Lukas Zilka21d8c982018-01-24 11:11:20 +0100917 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
918 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100919 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
920 &unilib);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200921
922 const std::string text1_utf8 = "ěščř";
923 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
924 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
925 text1.begin(), text1.end(),
926 /*count_from_beginning=*/true),
927 0);
928 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
929 text1.begin(), text1.end(),
930 /*count_from_beginning=*/false),
931 0);
932
933 const std::string text2_utf8 = ".,abčd";
934 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
935 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
936 text2.begin(), text2.end(),
937 /*count_from_beginning=*/true),
938 2);
939 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
940 text2.begin(), text2.end(),
941 /*count_from_beginning=*/false),
942 0);
943
944 const std::string text3_utf8 = ".,abčd[]";
945 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
946 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
947 text3.begin(), text3.end(),
948 /*count_from_beginning=*/true),
949 2);
950 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
951 text3.begin(), text3.end(),
952 /*count_from_beginning=*/false),
953 2);
954
955 const std::string text4_utf8 = "[abčd]";
956 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
957 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
958 text4.begin(), text4.end(),
959 /*count_from_beginning=*/true),
960 1);
961 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
962 text4.begin(), text4.end(),
963 /*count_from_beginning=*/false),
964 1);
965
966 const std::string text5_utf8 = "";
967 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
968 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
969 text5.begin(), text5.end(),
970 /*count_from_beginning=*/true),
971 0);
972 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
973 text5.begin(), text5.end(),
974 /*count_from_beginning=*/false),
975 0);
976
977 const std::string text6_utf8 = "012345ěščř";
978 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
979 UnicodeText::const_iterator text6_begin = text6.begin();
980 std::advance(text6_begin, 6);
981 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
982 text6_begin, text6.end(),
983 /*count_from_beginning=*/true),
984 0);
985 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
986 text6_begin, text6.end(),
987 /*count_from_beginning=*/false),
988 0);
989
990 const std::string text7_utf8 = "012345.,ěščř";
991 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
992 UnicodeText::const_iterator text7_begin = text7.begin();
993 std::advance(text7_begin, 6);
994 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
995 text7_begin, text7.end(),
996 /*count_from_beginning=*/true),
997 2);
998 UnicodeText::const_iterator text7_end = text7.begin();
999 std::advance(text7_end, 8);
1000 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1001 text7.begin(), text7_end,
1002 /*count_from_beginning=*/false),
1003 2);
1004
1005 // Test not stripping.
1006 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1007 "Hello [[[Wořld]] or not?", {0, 24}),
1008 std::make_pair(0, 24));
1009 // Test basic stripping.
1010 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1011 "Hello [[[Wořld]] or not?", {6, 16}),
1012 std::make_pair(9, 14));
1013 // Test stripping when everything is stripped.
1014 EXPECT_EQ(
1015 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
1016 std::make_pair(6, 6));
1017 // Test stripping empty string.
1018 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
1019 std::make_pair(0, 0));
1020}
1021
Lukas Zilka726b4d22017-12-13 16:37:03 +01001022TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
1023 const std::vector<Token> tokens{Token("Hělló", 0, 5),
1024 Token("fěěbař@google.com", 6, 23),
1025 Token("heře!", 24, 29)};
1026
1027 // Spans matching the tokens exactly.
1028 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
1029 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
1030 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
1031 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
1032 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
1033 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
1034
1035 // Snapping to containing tokens has no effect.
1036 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
1037 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
1038 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
1039 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
1040 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
1041 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
1042
1043 // Span boundaries inside tokens.
1044 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
1045 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
1046
1047 // Tokens adjacent to the span, but not overlapping.
1048 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
1049 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
1050}
1051
Matt Sharifid40f9762017-03-14 21:24:23 +01001052} // namespace
Lukas Zilka21d8c982018-01-24 11:11:20 +01001053} // namespace libtextclassifier2