blob: 5af8b962a00fc3a98b4450606fc3834ceffd2324 [file] [log] [blame]
Matt Sharifid40f9762017-03-14 21:24:23 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Lukas Zilka21d8c982018-01-24 11:11:20 +010017#include "feature-processor.h"
18
19#include "model-executor.h"
20#include "tensor-view.h"
Matt Sharifid40f9762017-03-14 21:24:23 +010021
22#include "gmock/gmock.h"
23#include "gtest/gtest.h"
24
Lukas Zilka21d8c982018-01-24 11:11:20 +010025namespace libtextclassifier2 {
Matt Sharifid40f9762017-03-14 21:24:23 +010026namespace {
27
28using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020029using testing::FloatEq;
Matt Sharifid40f9762017-03-14 21:24:23 +010030
Lukas Zilka21d8c982018-01-24 11:11:20 +010031flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
32 const FeatureProcessorOptionsT& options) {
33 flatbuffers::FlatBufferBuilder builder;
34 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
35 return builder.Release();
36}
37
Lukas Zilka726b4d22017-12-13 16:37:03 +010038class TestingFeatureProcessor : public FeatureProcessor {
39 public:
40 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
41 using FeatureProcessor::FeatureProcessor;
42 using FeatureProcessor::ICUTokenize;
43 using FeatureProcessor::IsCodepointInRanges;
44 using FeatureProcessor::SpanToLabel;
45 using FeatureProcessor::StripTokensFromOtherLines;
46 using FeatureProcessor::supported_codepoint_ranges_;
47 using FeatureProcessor::SupportedCodepointsRatio;
48};
49
Lukas Zilka21d8c982018-01-24 11:11:20 +010050// EmbeddingExecutor that always returns features based on
51class FakeEmbeddingExecutor : public EmbeddingExecutor {
52 public:
53 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
54 int dest_size) override {
55 TC_CHECK_GE(dest_size, 4);
56 EXPECT_EQ(sparse_features.size(), 1);
57 dest[0] = sparse_features.data()[0];
58 dest[1] = sparse_features.data()[0];
59 dest[2] = -sparse_features.data()[0];
60 dest[3] = -sparse_features.data()[0];
61 return true;
62 }
63
64 private:
65 std::vector<float> storage_;
66};
67
Matt Sharifid40f9762017-03-14 21:24:23 +010068TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020069 std::vector<Token> tokens{Token("Hělló", 0, 5),
70 Token("fěěbař@google.com", 6, 23),
71 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010072
73 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
74
75 // clang-format off
76 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020077 {Token("Hělló", 0, 5),
78 Token("fěě", 6, 9),
79 Token("bař", 9, 12),
80 Token("@google.com", 12, 23),
81 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010082 // clang-format on
83}
84
85TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020086 std::vector<Token> tokens{Token("Hělló", 0, 5),
87 Token("fěěbař@google.com", 6, 23),
88 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010089
90 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
91
92 // clang-format off
93 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020094 {Token("Hělló", 0, 5),
95 Token("fěěbař", 6, 12),
96 Token("@google.com", 12, 23),
97 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +010098 // clang-format on
99}
100
101TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200102 std::vector<Token> tokens{Token("Hělló", 0, 5),
103 Token("fěěbař@google.com", 6, 23),
104 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100105
106 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
107
108 // clang-format off
109 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200110 {Token("Hělló", 0, 5),
111 Token("fěě", 6, 9),
112 Token("bař@google.com", 9, 23),
113 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100114 // clang-format on
115}
116
117TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200118 std::vector<Token> tokens{Token("Hělló", 0, 5),
119 Token("fěěbař@google.com", 6, 23),
120 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100121
122 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
123
124 // clang-format off
125 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200126 {Token("Hělló", 0, 5),
127 Token("fěěbař@google.com", 6, 23),
128 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100129 // clang-format on
130}
131
132TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200133 std::vector<Token> tokens{Token("Hělló", 0, 5),
134 Token("fěěbař@google.com", 6, 23),
135 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100136
137 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
138
139 // clang-format off
140 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200141 {Token("Hě", 0, 2),
142 Token("lló", 2, 5),
143 Token("fěě", 6, 9),
144 Token("bař@google.com", 9, 23),
145 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100146 // clang-format on
147}
148
149TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100150 FeatureProcessorOptionsT options;
151 options.only_use_line_with_click = true;
152 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
153 TestingFeatureProcessor feature_processor(
154 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka726b4d22017-12-13 16:37:03 +0100155
Matt Sharifibe876dc2017-03-17 17:02:43 +0100156 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
157 const CodepointSpan span = {0, 5};
158 // clang-format off
159 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
160 Token("Lině", 6, 10),
161 Token("Sěcond", 11, 17),
162 Token("Lině", 18, 22),
163 Token("Thiřd", 23, 28),
164 Token("Lině", 29, 33)};
165 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100166
167 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100168 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100169 EXPECT_THAT(tokens,
170 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100171}
172
173TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100174 FeatureProcessorOptionsT options;
175 options.only_use_line_with_click = true;
176 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
177 TestingFeatureProcessor feature_processor(
178 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka726b4d22017-12-13 16:37:03 +0100179
Matt Sharifibe876dc2017-03-17 17:02:43 +0100180 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
181 const CodepointSpan span = {18, 22};
182 // clang-format off
183 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
184 Token("Lině", 6, 10),
185 Token("Sěcond", 11, 17),
186 Token("Lině", 18, 22),
187 Token("Thiřd", 23, 28),
188 Token("Lině", 29, 33)};
189 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100190
Matt Sharifibe876dc2017-03-17 17:02:43 +0100191 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100192 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100193 EXPECT_THAT(tokens, ElementsAreArray(
194 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100195}
196
197TEST(FeatureProcessorTest, KeepLineWithClickThird) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100198 FeatureProcessorOptionsT options;
199 options.only_use_line_with_click = true;
200 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
201 TestingFeatureProcessor feature_processor(
202 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka726b4d22017-12-13 16:37:03 +0100203
Matt Sharifibe876dc2017-03-17 17:02:43 +0100204 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
205 const CodepointSpan span = {24, 33};
206 // clang-format off
207 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
208 Token("Lině", 6, 10),
209 Token("Sěcond", 11, 17),
210 Token("Lině", 18, 22),
211 Token("Thiřd", 23, 28),
212 Token("Lině", 29, 33)};
213 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100214
Matt Sharifibe876dc2017-03-17 17:02:43 +0100215 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100216 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100217 EXPECT_THAT(tokens, ElementsAreArray(
218 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100219}
220
221TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100222 FeatureProcessorOptionsT options;
223 options.only_use_line_with_click = true;
224 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
225 TestingFeatureProcessor feature_processor(
226 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka726b4d22017-12-13 16:37:03 +0100227
Matt Sharifibe876dc2017-03-17 17:02:43 +0100228 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
229 const CodepointSpan span = {18, 22};
230 // clang-format off
231 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
232 Token("Lině", 6, 10),
233 Token("Sěcond", 11, 17),
234 Token("Lině", 18, 22),
235 Token("Thiřd", 23, 28),
236 Token("Lině", 29, 33)};
237 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100238
Matt Sharifibe876dc2017-03-17 17:02:43 +0100239 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100240 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100241 EXPECT_THAT(tokens, ElementsAreArray(
242 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100243}
244
245TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100246 FeatureProcessorOptionsT options;
247 options.only_use_line_with_click = true;
248 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
249 TestingFeatureProcessor feature_processor(
250 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka726b4d22017-12-13 16:37:03 +0100251
Matt Sharifibe876dc2017-03-17 17:02:43 +0100252 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
253 const CodepointSpan span = {5, 23};
254 // clang-format off
255 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
256 Token("Lině", 6, 10),
257 Token("Sěcond", 18, 23),
258 Token("Lině", 19, 23),
259 Token("Thiřd", 23, 28),
260 Token("Lině", 29, 33)};
261 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100262
Matt Sharifibe876dc2017-03-17 17:02:43 +0100263 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100264 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100265 EXPECT_THAT(tokens, ElementsAreArray(
266 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
267 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
268 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100269}
270
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200271TEST(FeatureProcessorTest, SpanToLabel) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100272 FeatureProcessorOptionsT options;
273 options.context_size = 1;
274 options.max_selection_span = 1;
275 options.snap_label_span_boundaries_to_containing_tokens = false;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200276
Lukas Zilka21d8c982018-01-24 11:11:20 +0100277 options.tokenization_codepoint_config.emplace_back(
278 new TokenizationCodepointRangeT());
279 auto& config = options.tokenization_codepoint_config.back();
280 config->start = 32;
281 config->end = 33;
282 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200283
Lukas Zilka21d8c982018-01-24 11:11:20 +0100284 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
285 TestingFeatureProcessor feature_processor(
286 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200287 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
288 ASSERT_EQ(3, tokens.size());
289 int label;
290 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
291 EXPECT_EQ(kInvalidLabel, label);
292 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
293 EXPECT_NE(kInvalidLabel, label);
294 TokenSpan token_span;
295 feature_processor.LabelToTokenSpan(label, &token_span);
296 EXPECT_EQ(0, token_span.first);
297 EXPECT_EQ(0, token_span.second);
298
299 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100300 options.snap_label_span_boundaries_to_containing_tokens = true;
301 flatbuffers::DetachedBuffer options2_fb =
302 PackFeatureProcessorOptions(options);
303 TestingFeatureProcessor feature_processor2(
304 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()));
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200305 int label2;
306 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
307 EXPECT_EQ(label, label2);
308 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
309 EXPECT_EQ(label, label2);
310 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
311 EXPECT_EQ(label, label2);
312
313 // Cross a token boundary.
314 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
315 EXPECT_EQ(kInvalidLabel, label2);
316 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
317 EXPECT_EQ(kInvalidLabel, label2);
318
319 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100320 options.context_size = 2;
321 options.max_selection_span = 2;
322 flatbuffers::DetachedBuffer options3_fb =
323 PackFeatureProcessorOptions(options);
324 TestingFeatureProcessor feature_processor3(
325 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()));
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200326 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
327 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
328 EXPECT_NE(kInvalidLabel, label2);
329 feature_processor3.LabelToTokenSpan(label2, &token_span);
330 EXPECT_EQ(1, token_span.first);
331 EXPECT_EQ(0, token_span.second);
332
333 int label3;
334 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
335 EXPECT_EQ(label2, label3);
336 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
337 EXPECT_EQ(label2, label3);
338 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
339 EXPECT_EQ(label2, label3);
340}
341
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200342TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100343 FeatureProcessorOptionsT options;
344 options.context_size = 1;
345 options.max_selection_span = 1;
346 options.snap_label_span_boundaries_to_containing_tokens = false;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200347
Lukas Zilka21d8c982018-01-24 11:11:20 +0100348 options.tokenization_codepoint_config.emplace_back(
349 new TokenizationCodepointRangeT());
350 auto& config = options.tokenization_codepoint_config.back();
351 config->start = 32;
352 config->end = 33;
353 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200354
Lukas Zilka21d8c982018-01-24 11:11:20 +0100355 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
356 TestingFeatureProcessor feature_processor(
357 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200358 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
359 ASSERT_EQ(3, tokens.size());
360 int label;
361 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
362 EXPECT_EQ(kInvalidLabel, label);
363 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
364 EXPECT_NE(kInvalidLabel, label);
365 TokenSpan token_span;
366 feature_processor.LabelToTokenSpan(label, &token_span);
367 EXPECT_EQ(0, token_span.first);
368 EXPECT_EQ(0, token_span.second);
369
370 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100371 options.snap_label_span_boundaries_to_containing_tokens = true;
372 flatbuffers::DetachedBuffer options2_fb =
373 PackFeatureProcessorOptions(options);
374 TestingFeatureProcessor feature_processor2(
375 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()));
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200376 int label2;
377 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
378 EXPECT_EQ(label, label2);
379 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
380 EXPECT_EQ(label, label2);
381 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
382 EXPECT_EQ(label, label2);
383
384 // Cross a token boundary.
385 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
386 EXPECT_EQ(kInvalidLabel, label2);
387 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
388 EXPECT_EQ(kInvalidLabel, label2);
389
390 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100391 options.context_size = 2;
392 options.max_selection_span = 2;
393 flatbuffers::DetachedBuffer options3_fb =
394 PackFeatureProcessorOptions(options);
395 TestingFeatureProcessor feature_processor3(
396 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()));
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200397 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
398 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
399 EXPECT_NE(kInvalidLabel, label2);
400 feature_processor3.LabelToTokenSpan(label2, &token_span);
401 EXPECT_EQ(1, token_span.first);
402 EXPECT_EQ(0, token_span.second);
403
404 int label3;
405 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
406 EXPECT_EQ(label2, label3);
407 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
408 EXPECT_EQ(label2, label3);
409 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
410 EXPECT_EQ(label2, label3);
411}
412
Matt Sharifibe876dc2017-03-17 17:02:43 +0100413TEST(FeatureProcessorTest, CenterTokenFromClick) {
414 int token_index;
415
416 // Exactly aligned indices.
417 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200418 {6, 11},
419 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100420 EXPECT_EQ(token_index, 1);
421
422 // Click is contained in a token.
423 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200424 {13, 17},
425 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100426 EXPECT_EQ(token_index, 2);
427
428 // Click spans two tokens.
429 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200430 {6, 17},
431 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100432 EXPECT_EQ(token_index, kInvalidIndex);
433}
434
435TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100436 int token_index;
437
438 // Selection of length 3. Exactly aligned indices.
439 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200440 {7, 27},
441 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
442 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100443 EXPECT_EQ(token_index, 2);
444
445 // Selection of length 1 token. Exactly aligned indices.
446 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200447 {21, 27},
448 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
449 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100450 EXPECT_EQ(token_index, 3);
451
452 // Selection marks sub-token range, with no tokens in it.
453 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200454 {29, 33},
455 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
456 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100457 EXPECT_EQ(token_index, kInvalidIndex);
458
459 // Selection of length 2. Sub-token indices.
460 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200461 {3, 25},
462 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
463 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100464 EXPECT_EQ(token_index, 1);
465
466 // Selection of length 1. Sub-token indices.
467 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200468 {22, 34},
469 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
470 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100471 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400472
473 // Some invalid ones.
474 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
475 EXPECT_EQ(token_index, -1);
476}
477
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200478TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100479 FeatureProcessorOptionsT options;
480 options.context_size = 2;
481 options.max_selection_span = 2;
482 options.snap_label_span_boundaries_to_containing_tokens = false;
483 options.feature_version = 2;
484 options.embedding_size = 4;
485 options.bounds_sensitive_features.reset(
486 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
487 options.bounds_sensitive_features->enabled = true;
488 options.bounds_sensitive_features->num_tokens_before = 5;
489 options.bounds_sensitive_features->num_tokens_inside_left = 3;
490 options.bounds_sensitive_features->num_tokens_inside_right = 3;
491 options.bounds_sensitive_features->num_tokens_after = 5;
492 options.bounds_sensitive_features->include_inside_bag = true;
493 options.bounds_sensitive_features->include_inside_length = true;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200494
Lukas Zilka21d8c982018-01-24 11:11:20 +0100495 options.tokenization_codepoint_config.emplace_back(
496 new TokenizationCodepointRangeT());
497 auto& config = options.tokenization_codepoint_config.back();
498 config->start = 32;
499 config->end = 33;
500 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200501
Lukas Zilka21d8c982018-01-24 11:11:20 +0100502 {
503 options.supported_codepoint_ranges.emplace_back(
504 new FeatureProcessorOptions_::CodepointRangeT());
505 auto& range = options.supported_codepoint_ranges.back();
506 range->start = 0;
507 range->end = 128;
508 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200509
Lukas Zilka21d8c982018-01-24 11:11:20 +0100510 {
511 options.supported_codepoint_ranges.emplace_back(
512 new FeatureProcessorOptions_::CodepointRangeT());
513 auto& range = options.supported_codepoint_ranges.back();
514 range->start = 10000;
515 range->end = 10001;
516 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200517
Lukas Zilka21d8c982018-01-24 11:11:20 +0100518 {
519 options.supported_codepoint_ranges.emplace_back(
520 new FeatureProcessorOptions_::CodepointRangeT());
521 auto& range = options.supported_codepoint_ranges.back();
522 range->start = 20000;
523 range->end = 30000;
524 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200525
Lukas Zilka21d8c982018-01-24 11:11:20 +0100526 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
527 TestingFeatureProcessor feature_processor(
528 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200529 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100530 {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200531 FloatEq(1.0));
532 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100533 {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200534 FloatEq(2.0 / 3));
535 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100536 {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200537 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200538 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
539 -1, feature_processor.supported_codepoint_ranges_));
540 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
541 0, feature_processor.supported_codepoint_ranges_));
542 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
543 10, feature_processor.supported_codepoint_ranges_));
544 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
545 127, feature_processor.supported_codepoint_ranges_));
546 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
547 128, feature_processor.supported_codepoint_ranges_));
548 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
549 9999, feature_processor.supported_codepoint_ranges_));
550 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
551 10000, feature_processor.supported_codepoint_ranges_));
552 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
553 10001, feature_processor.supported_codepoint_ranges_));
554 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
555 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200556
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200557 std::unique_ptr<CachedFeatures> cached_features;
558
Lukas Zilka21d8c982018-01-24 11:11:20 +0100559 FakeEmbeddingExecutor embedding_executor;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200560
Lukas Zilka21d8c982018-01-24 11:11:20 +0100561 const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
562 Token("eee", 8, 11)};
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200563
Lukas Zilka21d8c982018-01-24 11:11:20 +0100564 options.min_supported_codepoint_ratio = 0.0;
565 flatbuffers::DetachedBuffer options2_fb =
566 PackFeatureProcessorOptions(options);
567 TestingFeatureProcessor feature_processor2(
568 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()));
569 EXPECT_TRUE(feature_processor2.ExtractFeatures(
570 tokens, {0, 3}, &embedding_executor,
571 /*feature_vector_size=*/4, &cached_features));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200572
Lukas Zilka21d8c982018-01-24 11:11:20 +0100573 options.min_supported_codepoint_ratio = 0.2;
574 flatbuffers::DetachedBuffer options3_fb =
575 PackFeatureProcessorOptions(options);
576 TestingFeatureProcessor feature_processor3(
577 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()));
578 EXPECT_TRUE(feature_processor3.ExtractFeatures(
579 tokens, {0, 3}, &embedding_executor,
580 /*feature_vector_size=*/4, &cached_features));
581
582 options.min_supported_codepoint_ratio = 0.5;
583 flatbuffers::DetachedBuffer options4_fb =
584 PackFeatureProcessorOptions(options);
585 TestingFeatureProcessor feature_processor4(
586 flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()));
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200587 EXPECT_FALSE(feature_processor4.ExtractFeatures(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100588 tokens, {0, 3}, &embedding_executor,
589 /*feature_vector_size=*/4, &cached_features));
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200590}
591
592TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
593 std::vector<Token> tokens_orig{
594 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
595 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
596 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
597 Token("12", 0, 0)};
598
599 std::vector<Token> tokens;
600 int click_index;
601
602 // Try to click first token and see if it gets padded from left.
603 tokens = tokens_orig;
604 click_index = 0;
605 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
606 // clang-format off
607 EXPECT_EQ(tokens, std::vector<Token>({Token(),
608 Token(),
609 Token("0", 0, 0),
610 Token("1", 0, 0),
611 Token("2", 0, 0)}));
612 // clang-format on
613 EXPECT_EQ(click_index, 2);
614
615 // When we click the second token nothing should get padded.
616 tokens = tokens_orig;
617 click_index = 2;
618 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
619 // clang-format off
620 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
621 Token("1", 0, 0),
622 Token("2", 0, 0),
623 Token("3", 0, 0),
624 Token("4", 0, 0)}));
625 // clang-format on
626 EXPECT_EQ(click_index, 2);
627
628 // When we click the last token tokens should get padded from the right.
629 tokens = tokens_orig;
630 click_index = 12;
631 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
632 // clang-format off
633 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
634 Token("11", 0, 0),
635 Token("12", 0, 0),
636 Token(),
637 Token()}));
638 // clang-format on
639 EXPECT_EQ(click_index, 2);
640}
641
642TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
643 std::vector<Token> tokens_orig{
644 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
645 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
646 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
647 Token("12", 0, 0)};
648
649 std::vector<Token> tokens;
650 int click_index;
651
652 // Try to click first token and see if it gets padded from left to maximum
653 // context_size.
654 tokens = tokens_orig;
655 click_index = 0;
656 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
657 // clang-format off
658 EXPECT_EQ(tokens, std::vector<Token>({Token(),
659 Token(),
660 Token("0", 0, 0),
661 Token("1", 0, 0),
662 Token("2", 0, 0),
663 Token("3", 0, 0),
664 Token("4", 0, 0),
665 Token("5", 0, 0)}));
666 // clang-format on
667 EXPECT_EQ(click_index, 2);
668
669 // Clicking to the middle with enough context should not produce any padding.
670 tokens = tokens_orig;
671 click_index = 6;
672 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
673 // clang-format off
674 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
675 Token("2", 0, 0),
676 Token("3", 0, 0),
677 Token("4", 0, 0),
678 Token("5", 0, 0),
679 Token("6", 0, 0),
680 Token("7", 0, 0),
681 Token("8", 0, 0),
682 Token("9", 0, 0)}));
683 // clang-format on
684 EXPECT_EQ(click_index, 5);
685
686 // Clicking at the end should pad right to maximum context_size.
687 tokens = tokens_orig;
688 click_index = 11;
689 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
690 // clang-format off
691 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
692 Token("7", 0, 0),
693 Token("8", 0, 0),
694 Token("9", 0, 0),
695 Token("10", 0, 0),
696 Token("11", 0, 0),
697 Token("12", 0, 0),
698 Token(),
699 Token()}));
700 // clang-format on
701 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200702}
703
Lukas Zilka21d8c982018-01-24 11:11:20 +0100704TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
705 FeatureProcessorOptionsT options;
706 options.tokenization_codepoint_config.emplace_back(
707 new TokenizationCodepointRangeT());
708 {
709 auto& config = options.tokenization_codepoint_config.back();
710 config->start = 0;
711 config->end = 256;
712 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
713 config->script_id = 1;
714 }
715 options.tokenize_on_script_change = false;
Lukas Zilka40c18de2017-04-10 17:22:22 +0200716
Lukas Zilka21d8c982018-01-24 11:11:20 +0100717 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
718 TestingFeatureProcessor feature_processor(
719 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
720
721 EXPECT_EQ(feature_processor.Tokenize("앨라배마123웹사이트"),
722 std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
723
724 options.tokenize_on_script_change = true;
725 flatbuffers::DetachedBuffer options_fb2 =
726 PackFeatureProcessorOptions(options);
727 TestingFeatureProcessor feature_processor2(
728 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb2.data()));
729
730 EXPECT_EQ(feature_processor2.Tokenize("앨라배마123웹사이트"),
731 std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
732 Token("웹사이트", 7, 11)}));
733}
734
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks the ICU tokenization type on Thai text written without any
// whitespace: the text must come back as five tokens with contiguous spans.
// Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is defined.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{Token("พระบาท", 0, 6),
                                           Token("สมเด็จ", 6, 12),
                                           Token("พระ", 12, 15),
                                           Token("ปร", 15, 17),
                                           Token("มิ", 17, 19)};
  // clang-format on
  ASSERT_EQ(feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ"), expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200754
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks the ICU tokenization type with icu_preserve_whitespace_tokens set:
// each single space between the Thai words must be returned as its own
// one-codepoint token. Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is
// defined.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{Token("พระบาท", 0, 6),
                                           Token(" ", 6, 7),
                                           Token("สมเด็จ", 7, 13),
                                           Token(" ", 13, 14),
                                           Token("พระ", 14, 17),
                                           Token(" ", 17, 18),
                                           Token("ปร", 18, 20),
                                           Token(" ", 20, 21),
                                           Token("มิ", 21, 23)};
  // clang-format on
  ASSERT_EQ(feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ"),
            expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200780
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks the MIXED tokenization type on text that combines Japanese, accented
// Latin, CJK and a URL. Only compiled when LIBTEXTCLASSIFIER_TEST_ICU is
// defined.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint 32 (ASCII space) is configured as a whitespace separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  TokenizationCodepointRangeT& separator =
      *options.tokenization_codepoint_config.back();
  separator.start = 32;
  separator.end = 33;
  separator.role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Four adjacent {start, end} ranges registered for the internal tokenizer.
  // NOTE(review): presumably codepoints outside these ranges are left to ICU
  // in MIXED mode -- confirm against FeatureProcessor.
  const int kInternalTokenizerRanges[][2] = {
      {0, 128}, {128, 256}, {256, 384}, {384, 592}};
  for (const auto& bounds : kInternalTokenizerRanges) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    FeatureProcessorOptions_::CodepointRangeT& range =
        *options.internal_tokenizer_codepoint_ranges.back();
    range.start = bounds[0];
    range.end = bounds[1];
  }

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};
  // clang-format on
  ASSERT_EQ(feature_processor.Tokenize(
                "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/"),
            expected_tokens);
}
#endif
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200840
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200841TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100842 FeatureProcessorOptionsT options;
843 options.ignored_span_boundary_codepoints.push_back('.');
844 options.ignored_span_boundary_codepoints.push_back(',');
845 options.ignored_span_boundary_codepoints.push_back('[');
846 options.ignored_span_boundary_codepoints.push_back(']');
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200847
Lukas Zilka21d8c982018-01-24 11:11:20 +0100848 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
849 TestingFeatureProcessor feature_processor(
850 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()));
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200851
852 const std::string text1_utf8 = "ěščř";
853 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
854 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
855 text1.begin(), text1.end(),
856 /*count_from_beginning=*/true),
857 0);
858 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
859 text1.begin(), text1.end(),
860 /*count_from_beginning=*/false),
861 0);
862
863 const std::string text2_utf8 = ".,abčd";
864 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
865 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
866 text2.begin(), text2.end(),
867 /*count_from_beginning=*/true),
868 2);
869 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
870 text2.begin(), text2.end(),
871 /*count_from_beginning=*/false),
872 0);
873
874 const std::string text3_utf8 = ".,abčd[]";
875 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
876 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
877 text3.begin(), text3.end(),
878 /*count_from_beginning=*/true),
879 2);
880 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
881 text3.begin(), text3.end(),
882 /*count_from_beginning=*/false),
883 2);
884
885 const std::string text4_utf8 = "[abčd]";
886 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
887 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
888 text4.begin(), text4.end(),
889 /*count_from_beginning=*/true),
890 1);
891 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
892 text4.begin(), text4.end(),
893 /*count_from_beginning=*/false),
894 1);
895
896 const std::string text5_utf8 = "";
897 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
898 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
899 text5.begin(), text5.end(),
900 /*count_from_beginning=*/true),
901 0);
902 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
903 text5.begin(), text5.end(),
904 /*count_from_beginning=*/false),
905 0);
906
907 const std::string text6_utf8 = "012345ěščř";
908 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
909 UnicodeText::const_iterator text6_begin = text6.begin();
910 std::advance(text6_begin, 6);
911 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
912 text6_begin, text6.end(),
913 /*count_from_beginning=*/true),
914 0);
915 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
916 text6_begin, text6.end(),
917 /*count_from_beginning=*/false),
918 0);
919
920 const std::string text7_utf8 = "012345.,ěščř";
921 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
922 UnicodeText::const_iterator text7_begin = text7.begin();
923 std::advance(text7_begin, 6);
924 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
925 text7_begin, text7.end(),
926 /*count_from_beginning=*/true),
927 2);
928 UnicodeText::const_iterator text7_end = text7.begin();
929 std::advance(text7_end, 8);
930 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
931 text7.begin(), text7_end,
932 /*count_from_beginning=*/false),
933 2);
934
935 // Test not stripping.
936 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
937 "Hello [[[Wořld]] or not?", {0, 24}),
938 std::make_pair(0, 24));
939 // Test basic stripping.
940 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
941 "Hello [[[Wořld]] or not?", {6, 16}),
942 std::make_pair(9, 14));
943 // Test stripping when everything is stripped.
944 EXPECT_EQ(
945 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
946 std::make_pair(6, 6));
947 // Test stripping empty string.
948 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
949 std::make_pair(0, 0));
950}
951
Lukas Zilka726b4d22017-12-13 16:37:03 +0100952TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
953 const std::vector<Token> tokens{Token("Hělló", 0, 5),
954 Token("fěěbař@google.com", 6, 23),
955 Token("heře!", 24, 29)};
956
957 // Spans matching the tokens exactly.
958 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
959 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
960 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
961 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
962 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
963 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
964
965 // Snapping to containing tokens has no effect.
966 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
967 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
968 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
969 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
970 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
971 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
972
973 // Span boundaries inside tokens.
974 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
975 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
976
977 // Tokens adjacent to the span, but not overlapping.
978 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
979 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
980}
981
Matt Sharifid40f9762017-03-14 21:24:23 +0100982} // namespace
Lukas Zilka21d8c982018-01-24 11:11:20 +0100983} // namespace libtextclassifier2