blob: c9f0e0df1f7407cdc830de174953079f9462c1f2 [file] [log] [blame]
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Tony Mak6c4cc672018-09-17 11:48:50 +010017#include "annotator/feature-processor.h"
Lukas Zilka21d8c982018-01-24 11:11:20 +010018
Tony Mak6c4cc672018-09-17 11:48:50 +010019#include "annotator/model-executor.h"
20#include "utils/tensor-view.h"
Matt Sharifid40f9762017-03-14 21:24:23 +010021
22#include "gmock/gmock.h"
23#include "gtest/gtest.h"
24
Tony Mak6c4cc672018-09-17 11:48:50 +010025namespace libtextclassifier3 {
Matt Sharifid40f9762017-03-14 21:24:23 +010026namespace {
27
28using testing::ElementsAreArray;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +020029using testing::FloatEq;
Lukas Zilkaba849e72018-03-08 14:48:21 +010030using testing::Matcher;
Matt Sharifid40f9762017-03-14 21:24:23 +010031
Lukas Zilka21d8c982018-01-24 11:11:20 +010032flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33 const FeatureProcessorOptionsT& options) {
34 flatbuffers::FlatBufferBuilder builder;
35 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36 return builder.Release();
37}
38
// Returns a copy of the elements of `vector` in the half-open index range
// [start, end).
//
// Both indices are clamped to [0, vector.size()], and an empty vector is
// returned when the clamped range is empty or inverted. A bad index in a test
// therefore produces a shorter (mismatching) result instead of undefined
// behavior from out-of-range iterator arithmetic.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const int size = static_cast<int>(vector.size());
  const int first = start < 0 ? 0 : (start > size ? size : start);
  const int last = end < 0 ? 0 : (end > size ? size : end);
  if (first >= last) {
    return {};
  }
  return std::vector<T>(vector.begin() + first, vector.begin() + last);
}
43
44Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45 std::vector<Matcher<float>> matchers;
46 for (const float value : values) {
47 matchers.push_back(FloatEq(value));
48 }
49 return ElementsAreArray(matchers);
50}
51
Lukas Zilka726b4d22017-12-13 16:37:03 +010052class TestingFeatureProcessor : public FeatureProcessor {
53 public:
54 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55 using FeatureProcessor::FeatureProcessor;
56 using FeatureProcessor::ICUTokenize;
57 using FeatureProcessor::IsCodepointInRanges;
58 using FeatureProcessor::SpanToLabel;
59 using FeatureProcessor::StripTokensFromOtherLines;
60 using FeatureProcessor::supported_codepoint_ranges_;
61 using FeatureProcessor::SupportedCodepointsRatio;
62};
63
Lukas Zilka21d8c982018-01-24 11:11:20 +010064// EmbeddingExecutor that always returns features based on
65class FakeEmbeddingExecutor : public EmbeddingExecutor {
66 public:
67 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
Lukas Zilkaba849e72018-03-08 14:48:21 +010068 int dest_size) const override {
Tony Mak6c4cc672018-09-17 11:48:50 +010069 TC3_CHECK_GE(dest_size, 4);
Lukas Zilka21d8c982018-01-24 11:11:20 +010070 EXPECT_EQ(sparse_features.size(), 1);
71 dest[0] = sparse_features.data()[0];
72 dest[1] = sparse_features.data()[0];
73 dest[2] = -sparse_features.data()[0];
74 dest[3] = -sparse_features.data()[0];
75 return true;
76 }
77
78 private:
79 std::vector<float> storage_;
80};
81
Tony Mak6c4cc672018-09-17 11:48:50 +010082class FeatureProcessorTest : public ::testing::Test {
83 protected:
84 FeatureProcessorTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
85 UniLib unilib_;
86};
87
88TEST_F(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +020089 std::vector<Token> tokens{Token("Hělló", 0, 5),
90 Token("fěěbař@google.com", 6, 23),
91 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +010092
93 internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);
94
95 // clang-format off
96 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +020097 {Token("Hělló", 0, 5),
98 Token("fěě", 6, 9),
99 Token("bař", 9, 12),
100 Token("@google.com", 12, 23),
101 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100102 // clang-format on
103}
104
Tony Mak6c4cc672018-09-17 11:48:50 +0100105TEST_F(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200106 std::vector<Token> tokens{Token("Hělló", 0, 5),
107 Token("fěěbař@google.com", 6, 23),
108 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100109
110 internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);
111
112 // clang-format off
113 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200114 {Token("Hělló", 0, 5),
115 Token("fěěbař", 6, 12),
116 Token("@google.com", 12, 23),
117 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100118 // clang-format on
119}
120
Tony Mak6c4cc672018-09-17 11:48:50 +0100121TEST_F(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200122 std::vector<Token> tokens{Token("Hělló", 0, 5),
123 Token("fěěbař@google.com", 6, 23),
124 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100125
126 internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);
127
128 // clang-format off
129 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200130 {Token("Hělló", 0, 5),
131 Token("fěě", 6, 9),
132 Token("bař@google.com", 9, 23),
133 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100134 // clang-format on
135}
136
Tony Mak6c4cc672018-09-17 11:48:50 +0100137TEST_F(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200138 std::vector<Token> tokens{Token("Hělló", 0, 5),
139 Token("fěěbař@google.com", 6, 23),
140 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100141
142 internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);
143
144 // clang-format off
145 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200146 {Token("Hělló", 0, 5),
147 Token("fěěbař@google.com", 6, 23),
148 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100149 // clang-format on
150}
151
Tony Mak6c4cc672018-09-17 11:48:50 +0100152TEST_F(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200153 std::vector<Token> tokens{Token("Hělló", 0, 5),
154 Token("fěěbař@google.com", 6, 23),
155 Token("heře!", 24, 29)};
Matt Sharifid40f9762017-03-14 21:24:23 +0100156
157 internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);
158
159 // clang-format off
160 EXPECT_THAT(tokens, ElementsAreArray(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200161 {Token("Hě", 0, 2),
162 Token("lló", 2, 5),
163 Token("fěě", 6, 9),
164 Token("bař@google.com", 9, 23),
165 Token("heře!", 24, 29)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100166 // clang-format on
167}
168
Tony Mak6c4cc672018-09-17 11:48:50 +0100169TEST_F(FeatureProcessorTest, KeepLineWithClickFirst) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100170 FeatureProcessorOptionsT options;
171 options.only_use_line_with_click = true;
172 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
173 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100174 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100175 &unilib_);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100176
Matt Sharifibe876dc2017-03-17 17:02:43 +0100177 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
178 const CodepointSpan span = {0, 5};
179 // clang-format off
180 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
181 Token("Lině", 6, 10),
182 Token("Sěcond", 11, 17),
183 Token("Lině", 18, 22),
184 Token("Thiřd", 23, 28),
185 Token("Lině", 29, 33)};
186 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100187
188 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100189 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100190 EXPECT_THAT(tokens,
191 ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100192}
193
Tony Mak6c4cc672018-09-17 11:48:50 +0100194TEST_F(FeatureProcessorTest, KeepLineWithClickSecond) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100195 FeatureProcessorOptionsT options;
196 options.only_use_line_with_click = true;
197 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
198 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100199 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100200 &unilib_);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100201
Matt Sharifibe876dc2017-03-17 17:02:43 +0100202 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
203 const CodepointSpan span = {18, 22};
204 // clang-format off
205 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
206 Token("Lině", 6, 10),
207 Token("Sěcond", 11, 17),
208 Token("Lině", 18, 22),
209 Token("Thiřd", 23, 28),
210 Token("Lině", 29, 33)};
211 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100212
Matt Sharifibe876dc2017-03-17 17:02:43 +0100213 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100214 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100215 EXPECT_THAT(tokens, ElementsAreArray(
216 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100217}
218
Tony Mak6c4cc672018-09-17 11:48:50 +0100219TEST_F(FeatureProcessorTest, KeepLineWithClickThird) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100220 FeatureProcessorOptionsT options;
221 options.only_use_line_with_click = true;
222 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
223 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100224 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100225 &unilib_);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100226
Matt Sharifibe876dc2017-03-17 17:02:43 +0100227 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
228 const CodepointSpan span = {24, 33};
229 // clang-format off
230 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
231 Token("Lině", 6, 10),
232 Token("Sěcond", 11, 17),
233 Token("Lině", 18, 22),
234 Token("Thiřd", 23, 28),
235 Token("Lině", 29, 33)};
236 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100237
Matt Sharifibe876dc2017-03-17 17:02:43 +0100238 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100239 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100240 EXPECT_THAT(tokens, ElementsAreArray(
241 {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100242}
243
Tony Mak6c4cc672018-09-17 11:48:50 +0100244TEST_F(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100245 FeatureProcessorOptionsT options;
246 options.only_use_line_with_click = true;
247 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
248 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100249 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100250 &unilib_);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100251
Matt Sharifibe876dc2017-03-17 17:02:43 +0100252 const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
253 const CodepointSpan span = {18, 22};
254 // clang-format off
255 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
256 Token("Lině", 6, 10),
257 Token("Sěcond", 11, 17),
258 Token("Lině", 18, 22),
259 Token("Thiřd", 23, 28),
260 Token("Lině", 29, 33)};
261 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100262
Matt Sharifibe876dc2017-03-17 17:02:43 +0100263 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100264 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100265 EXPECT_THAT(tokens, ElementsAreArray(
266 {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100267}
268
Tony Mak6c4cc672018-09-17 11:48:50 +0100269TEST_F(FeatureProcessorTest, KeepLineWithCrosslineClick) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100270 FeatureProcessorOptionsT options;
271 options.only_use_line_with_click = true;
272 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
273 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100274 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100275 &unilib_);
Lukas Zilka726b4d22017-12-13 16:37:03 +0100276
Matt Sharifibe876dc2017-03-17 17:02:43 +0100277 const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
278 const CodepointSpan span = {5, 23};
279 // clang-format off
280 std::vector<Token> tokens = {Token("Fiřst", 0, 5),
281 Token("Lině", 6, 10),
282 Token("Sěcond", 18, 23),
283 Token("Lině", 19, 23),
284 Token("Thiřd", 23, 28),
285 Token("Lině", 29, 33)};
286 // clang-format on
Matt Sharifid40f9762017-03-14 21:24:23 +0100287
Matt Sharifibe876dc2017-03-17 17:02:43 +0100288 // Keeps the first line.
Lukas Zilka726b4d22017-12-13 16:37:03 +0100289 feature_processor.StripTokensFromOtherLines(context, span, &tokens);
Matt Sharifibe876dc2017-03-17 17:02:43 +0100290 EXPECT_THAT(tokens, ElementsAreArray(
291 {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
292 Token("Sěcond", 18, 23), Token("Lině", 19, 23),
293 Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
Matt Sharifid40f9762017-03-14 21:24:23 +0100294}
295
Tony Mak6c4cc672018-09-17 11:48:50 +0100296TEST_F(FeatureProcessorTest, SpanToLabel) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100297 FeatureProcessorOptionsT options;
298 options.context_size = 1;
299 options.max_selection_span = 1;
300 options.snap_label_span_boundaries_to_containing_tokens = false;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200301
Lukas Zilka21d8c982018-01-24 11:11:20 +0100302 options.tokenization_codepoint_config.emplace_back(
303 new TokenizationCodepointRangeT());
304 auto& config = options.tokenization_codepoint_config.back();
305 config->start = 32;
306 config->end = 33;
307 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200308
Lukas Zilka21d8c982018-01-24 11:11:20 +0100309 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
310 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100311 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100312 &unilib_);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200313 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
314 ASSERT_EQ(3, tokens.size());
315 int label;
316 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
317 EXPECT_EQ(kInvalidLabel, label);
318 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
319 EXPECT_NE(kInvalidLabel, label);
320 TokenSpan token_span;
321 feature_processor.LabelToTokenSpan(label, &token_span);
322 EXPECT_EQ(0, token_span.first);
323 EXPECT_EQ(0, token_span.second);
324
325 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100326 options.snap_label_span_boundaries_to_containing_tokens = true;
327 flatbuffers::DetachedBuffer options2_fb =
328 PackFeatureProcessorOptions(options);
329 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100330 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100331 &unilib_);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200332 int label2;
333 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
334 EXPECT_EQ(label, label2);
335 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
336 EXPECT_EQ(label, label2);
337 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
338 EXPECT_EQ(label, label2);
339
340 // Cross a token boundary.
341 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
342 EXPECT_EQ(kInvalidLabel, label2);
343 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
344 EXPECT_EQ(kInvalidLabel, label2);
345
346 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100347 options.context_size = 2;
348 options.max_selection_span = 2;
349 flatbuffers::DetachedBuffer options3_fb =
350 PackFeatureProcessorOptions(options);
351 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100352 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100353 &unilib_);
Matt Sharifi0d68ef92017-03-27 14:20:21 +0200354 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
355 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
356 EXPECT_NE(kInvalidLabel, label2);
357 feature_processor3.LabelToTokenSpan(label2, &token_span);
358 EXPECT_EQ(1, token_span.first);
359 EXPECT_EQ(0, token_span.second);
360
361 int label3;
362 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
363 EXPECT_EQ(label2, label3);
364 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
365 EXPECT_EQ(label2, label3);
366 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
367 EXPECT_EQ(label2, label3);
368}
369
Tony Mak6c4cc672018-09-17 11:48:50 +0100370TEST_F(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100371 FeatureProcessorOptionsT options;
372 options.context_size = 1;
373 options.max_selection_span = 1;
374 options.snap_label_span_boundaries_to_containing_tokens = false;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200375
Lukas Zilka21d8c982018-01-24 11:11:20 +0100376 options.tokenization_codepoint_config.emplace_back(
377 new TokenizationCodepointRangeT());
378 auto& config = options.tokenization_codepoint_config.back();
379 config->start = 32;
380 config->end = 33;
381 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200382
Lukas Zilka21d8c982018-01-24 11:11:20 +0100383 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
384 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100385 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100386 &unilib_);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200387 std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
388 ASSERT_EQ(3, tokens.size());
389 int label;
390 ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
391 EXPECT_EQ(kInvalidLabel, label);
392 ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
393 EXPECT_NE(kInvalidLabel, label);
394 TokenSpan token_span;
395 feature_processor.LabelToTokenSpan(label, &token_span);
396 EXPECT_EQ(0, token_span.first);
397 EXPECT_EQ(0, token_span.second);
398
399 // Reconfigure with snapping enabled.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100400 options.snap_label_span_boundaries_to_containing_tokens = true;
401 flatbuffers::DetachedBuffer options2_fb =
402 PackFeatureProcessorOptions(options);
403 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100404 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100405 &unilib_);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200406 int label2;
407 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
408 EXPECT_EQ(label, label2);
409 ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
410 EXPECT_EQ(label, label2);
411 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
412 EXPECT_EQ(label, label2);
413
414 // Cross a token boundary.
415 ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
416 EXPECT_EQ(kInvalidLabel, label2);
417 ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
418 EXPECT_EQ(kInvalidLabel, label2);
419
420 // Multiple tokens.
Lukas Zilka21d8c982018-01-24 11:11:20 +0100421 options.context_size = 2;
422 options.max_selection_span = 2;
423 flatbuffers::DetachedBuffer options3_fb =
424 PackFeatureProcessorOptions(options);
425 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100426 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100427 &unilib_);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200428 tokens = feature_processor3.Tokenize("zero, one, two, three, four");
429 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
430 EXPECT_NE(kInvalidLabel, label2);
431 feature_processor3.LabelToTokenSpan(label2, &token_span);
432 EXPECT_EQ(1, token_span.first);
433 EXPECT_EQ(0, token_span.second);
434
435 int label3;
436 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
437 EXPECT_EQ(label2, label3);
438 ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
439 EXPECT_EQ(label2, label3);
440 ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
441 EXPECT_EQ(label2, label3);
442}
443
Tony Mak6c4cc672018-09-17 11:48:50 +0100444TEST_F(FeatureProcessorTest, CenterTokenFromClick) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100445 int token_index;
446
447 // Exactly aligned indices.
448 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200449 {6, 11},
450 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100451 EXPECT_EQ(token_index, 1);
452
453 // Click is contained in a token.
454 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200455 {13, 17},
456 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100457 EXPECT_EQ(token_index, 2);
458
459 // Click spans two tokens.
460 token_index = internal::CenterTokenFromClick(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200461 {6, 17},
462 {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100463 EXPECT_EQ(token_index, kInvalidIndex);
464}
465
Tony Mak6c4cc672018-09-17 11:48:50 +0100466TEST_F(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
Matt Sharifibe876dc2017-03-17 17:02:43 +0100467 int token_index;
468
469 // Selection of length 3. Exactly aligned indices.
470 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200471 {7, 27},
472 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
473 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100474 EXPECT_EQ(token_index, 2);
475
476 // Selection of length 1 token. Exactly aligned indices.
477 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200478 {21, 27},
479 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
480 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100481 EXPECT_EQ(token_index, 3);
482
483 // Selection marks sub-token range, with no tokens in it.
484 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200485 {29, 33},
486 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
487 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100488 EXPECT_EQ(token_index, kInvalidIndex);
489
490 // Selection of length 2. Sub-token indices.
491 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200492 {3, 25},
493 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
494 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100495 EXPECT_EQ(token_index, 1);
496
497 // Selection of length 1. Sub-token indices.
498 token_index = internal::CenterTokenFromMiddleOfSelection(
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200499 {22, 34},
500 {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
501 Token("Token4", 21, 27), Token("Token5", 28, 34)});
Matt Sharifibe876dc2017-03-17 17:02:43 +0100502 EXPECT_EQ(token_index, 4);
Alex Salcianu9087f1f2017-03-22 21:22:39 -0400503
504 // Some invalid ones.
505 token_index = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
506 EXPECT_EQ(token_index, -1);
507}
508
Tony Mak6c4cc672018-09-17 11:48:50 +0100509TEST_F(FeatureProcessorTest, SupportedCodepointsRatio) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100510 FeatureProcessorOptionsT options;
511 options.context_size = 2;
512 options.max_selection_span = 2;
513 options.snap_label_span_boundaries_to_containing_tokens = false;
514 options.feature_version = 2;
515 options.embedding_size = 4;
516 options.bounds_sensitive_features.reset(
517 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
518 options.bounds_sensitive_features->enabled = true;
519 options.bounds_sensitive_features->num_tokens_before = 5;
520 options.bounds_sensitive_features->num_tokens_inside_left = 3;
521 options.bounds_sensitive_features->num_tokens_inside_right = 3;
522 options.bounds_sensitive_features->num_tokens_after = 5;
523 options.bounds_sensitive_features->include_inside_bag = true;
524 options.bounds_sensitive_features->include_inside_length = true;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200525
Lukas Zilka21d8c982018-01-24 11:11:20 +0100526 options.tokenization_codepoint_config.emplace_back(
527 new TokenizationCodepointRangeT());
528 auto& config = options.tokenization_codepoint_config.back();
529 config->start = 32;
530 config->end = 33;
531 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200532
Lukas Zilka21d8c982018-01-24 11:11:20 +0100533 {
534 options.supported_codepoint_ranges.emplace_back(
535 new FeatureProcessorOptions_::CodepointRangeT());
536 auto& range = options.supported_codepoint_ranges.back();
537 range->start = 0;
538 range->end = 128;
539 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200540
Lukas Zilka21d8c982018-01-24 11:11:20 +0100541 {
542 options.supported_codepoint_ranges.emplace_back(
543 new FeatureProcessorOptions_::CodepointRangeT());
544 auto& range = options.supported_codepoint_ranges.back();
545 range->start = 10000;
546 range->end = 10001;
547 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200548
Lukas Zilka21d8c982018-01-24 11:11:20 +0100549 {
550 options.supported_codepoint_ranges.emplace_back(
551 new FeatureProcessorOptions_::CodepointRangeT());
552 auto& range = options.supported_codepoint_ranges.back();
553 range->start = 20000;
554 range->end = 30000;
555 }
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200556
Lukas Zilka21d8c982018-01-24 11:11:20 +0100557 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
558 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100559 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100560 &unilib_);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200561 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100562 {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200563 FloatEq(1.0));
564 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100565 {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200566 FloatEq(2.0 / 3));
567 EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
Lukas Zilka21d8c982018-01-24 11:11:20 +0100568 {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200569 FloatEq(0.0));
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200570 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
571 -1, feature_processor.supported_codepoint_ranges_));
572 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
573 0, feature_processor.supported_codepoint_ranges_));
574 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
575 10, feature_processor.supported_codepoint_ranges_));
576 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
577 127, feature_processor.supported_codepoint_ranges_));
578 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
579 128, feature_processor.supported_codepoint_ranges_));
580 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
581 9999, feature_processor.supported_codepoint_ranges_));
582 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
583 10000, feature_processor.supported_codepoint_ranges_));
584 EXPECT_FALSE(feature_processor.IsCodepointInRanges(
585 10001, feature_processor.supported_codepoint_ranges_));
586 EXPECT_TRUE(feature_processor.IsCodepointInRanges(
587 25000, feature_processor.supported_codepoint_ranges_));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200588
Lukas Zilka21d8c982018-01-24 11:11:20 +0100589 const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
590 Token("eee", 8, 11)};
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200591
Lukas Zilka21d8c982018-01-24 11:11:20 +0100592 options.min_supported_codepoint_ratio = 0.0;
593 flatbuffers::DetachedBuffer options2_fb =
594 PackFeatureProcessorOptions(options);
595 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100596 flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100597 &unilib_);
Lukas Zilka434442d2018-04-25 11:38:51 +0200598 EXPECT_TRUE(feature_processor2.HasEnoughSupportedCodepoints(
599 tokens, /*token_span=*/{0, 3}));
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200600
Lukas Zilka21d8c982018-01-24 11:11:20 +0100601 options.min_supported_codepoint_ratio = 0.2;
602 flatbuffers::DetachedBuffer options3_fb =
603 PackFeatureProcessorOptions(options);
604 TestingFeatureProcessor feature_processor3(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100605 flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100606 &unilib_);
Lukas Zilka434442d2018-04-25 11:38:51 +0200607 EXPECT_TRUE(feature_processor3.HasEnoughSupportedCodepoints(
608 tokens, /*token_span=*/{0, 3}));
Lukas Zilka21d8c982018-01-24 11:11:20 +0100609
610 options.min_supported_codepoint_ratio = 0.5;
611 flatbuffers::DetachedBuffer options4_fb =
612 PackFeatureProcessorOptions(options);
613 TestingFeatureProcessor feature_processor4(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100614 flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100615 &unilib_);
Lukas Zilka434442d2018-04-25 11:38:51 +0200616 EXPECT_FALSE(feature_processor4.HasEnoughSupportedCodepoints(
617 tokens, /*token_span=*/{0, 3}));
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200618}
619
Tony Mak6c4cc672018-09-17 11:48:50 +0100620TEST_F(FeatureProcessorTest, InSpanFeature) {
Lukas Zilkab23e2122018-02-09 10:25:19 +0100621 FeatureProcessorOptionsT options;
622 options.context_size = 2;
623 options.max_selection_span = 2;
624 options.snap_label_span_boundaries_to_containing_tokens = false;
625 options.feature_version = 2;
626 options.embedding_size = 4;
627 options.extract_selection_mask_feature = true;
628
629 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100630 TestingFeatureProcessor feature_processor(
631 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100632 &unilib_);
Lukas Zilkab23e2122018-02-09 10:25:19 +0100633
634 std::unique_ptr<CachedFeatures> cached_features;
635
636 FakeEmbeddingExecutor embedding_executor;
637
638 const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
639 Token("ccc", 8, 11), Token("ddd", 12, 15)};
640
641 EXPECT_TRUE(feature_processor.ExtractFeatures(
642 tokens, /*token_span=*/{0, 4},
643 /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
Lukas Zilkaba849e72018-03-08 14:48:21 +0100644 /*embedding_cache=*/nullptr, /*feature_vector_size=*/5,
645 &cached_features));
Lukas Zilkab23e2122018-02-09 10:25:19 +0100646 std::vector<float> features;
647 cached_features->AppendClickContextFeaturesForClick(1, &features);
648 ASSERT_EQ(features.size(), 25);
649 EXPECT_THAT(features[4], FloatEq(0.0));
650 EXPECT_THAT(features[9], FloatEq(0.0));
651 EXPECT_THAT(features[14], FloatEq(1.0));
652 EXPECT_THAT(features[19], FloatEq(1.0));
653 EXPECT_THAT(features[24], FloatEq(0.0));
654}
655
Tony Mak6c4cc672018-09-17 11:48:50 +0100656TEST_F(FeatureProcessorTest, EmbeddingCache) {
Lukas Zilkaba849e72018-03-08 14:48:21 +0100657 FeatureProcessorOptionsT options;
658 options.context_size = 2;
659 options.max_selection_span = 2;
660 options.snap_label_span_boundaries_to_containing_tokens = false;
661 options.feature_version = 2;
662 options.embedding_size = 4;
663 options.bounds_sensitive_features.reset(
664 new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
665 options.bounds_sensitive_features->enabled = true;
666 options.bounds_sensitive_features->num_tokens_before = 3;
667 options.bounds_sensitive_features->num_tokens_inside_left = 2;
668 options.bounds_sensitive_features->num_tokens_inside_right = 2;
669 options.bounds_sensitive_features->num_tokens_after = 3;
670
671 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100672 TestingFeatureProcessor feature_processor(
673 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100674 &unilib_);
Lukas Zilkaba849e72018-03-08 14:48:21 +0100675
676 std::unique_ptr<CachedFeatures> cached_features;
677
678 FakeEmbeddingExecutor embedding_executor;
679
680 const std::vector<Token> tokens = {
681 Token("aaa", 0, 3), Token("bbb", 4, 7), Token("ccc", 8, 11),
682 Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};
683
684 // We pre-populate the cache with dummy embeddings, to make sure they are
685 // used when populating the features vector.
686 const std::vector<float> cached_padding_features = {10.0, -10.0, 10.0, -10.0};
687 const std::vector<float> cached_features1 = {1.0, 2.0, 3.0, 4.0};
688 const std::vector<float> cached_features2 = {5.0, 6.0, 7.0, 8.0};
689 FeatureProcessor::EmbeddingCache embedding_cache = {
690 {{kInvalidIndex, kInvalidIndex}, cached_padding_features},
691 {{4, 7}, cached_features1},
692 {{12, 15}, cached_features2},
693 };
694
695 EXPECT_TRUE(feature_processor.ExtractFeatures(
696 tokens, /*token_span=*/{0, 6},
697 /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
698 &embedding_executor, &embedding_cache, /*feature_vector_size=*/4,
699 &cached_features));
700 std::vector<float> features;
701 cached_features->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
702 ASSERT_EQ(features.size(), 40);
703 // Check that the dummy embeddings were used.
704 EXPECT_THAT(Subvector(features, 0, 4),
705 ElementsAreFloat(cached_padding_features));
706 EXPECT_THAT(Subvector(features, 8, 12), ElementsAreFloat(cached_features1));
707 EXPECT_THAT(Subvector(features, 16, 20), ElementsAreFloat(cached_features2));
708 EXPECT_THAT(Subvector(features, 24, 28), ElementsAreFloat(cached_features2));
709 EXPECT_THAT(Subvector(features, 36, 40),
710 ElementsAreFloat(cached_padding_features));
711 // Check that the real embeddings were cached.
712 EXPECT_EQ(embedding_cache.size(), 7);
713 EXPECT_THAT(Subvector(features, 4, 8),
714 ElementsAreFloat(embedding_cache.at({0, 3})));
715 EXPECT_THAT(Subvector(features, 12, 16),
716 ElementsAreFloat(embedding_cache.at({8, 11})));
717 EXPECT_THAT(Subvector(features, 20, 24),
718 ElementsAreFloat(embedding_cache.at({8, 11})));
719 EXPECT_THAT(Subvector(features, 28, 32),
720 ElementsAreFloat(embedding_cache.at({16, 19})));
721 EXPECT_THAT(Subvector(features, 32, 36),
722 ElementsAreFloat(embedding_cache.at({20, 23})));
723}
724
Tony Mak6c4cc672018-09-17 11:48:50 +0100725TEST_F(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200726 std::vector<Token> tokens_orig{
727 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
728 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
729 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
730 Token("12", 0, 0)};
731
732 std::vector<Token> tokens;
733 int click_index;
734
735 // Try to click first token and see if it gets padded from left.
736 tokens = tokens_orig;
737 click_index = 0;
738 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
739 // clang-format off
740 EXPECT_EQ(tokens, std::vector<Token>({Token(),
741 Token(),
742 Token("0", 0, 0),
743 Token("1", 0, 0),
744 Token("2", 0, 0)}));
745 // clang-format on
746 EXPECT_EQ(click_index, 2);
747
748 // When we click the second token nothing should get padded.
749 tokens = tokens_orig;
750 click_index = 2;
751 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
752 // clang-format off
753 EXPECT_EQ(tokens, std::vector<Token>({Token("0", 0, 0),
754 Token("1", 0, 0),
755 Token("2", 0, 0),
756 Token("3", 0, 0),
757 Token("4", 0, 0)}));
758 // clang-format on
759 EXPECT_EQ(click_index, 2);
760
761 // When we click the last token tokens should get padded from the right.
762 tokens = tokens_orig;
763 click_index = 12;
764 internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
765 // clang-format off
766 EXPECT_EQ(tokens, std::vector<Token>({Token("10", 0, 0),
767 Token("11", 0, 0),
768 Token("12", 0, 0),
769 Token(),
770 Token()}));
771 // clang-format on
772 EXPECT_EQ(click_index, 2);
773}
774
Tony Mak6c4cc672018-09-17 11:48:50 +0100775TEST_F(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
Lukas Zilka6bb39a82017-04-07 19:55:11 +0200776 std::vector<Token> tokens_orig{
777 Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
778 Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0), Token("7", 0, 0),
779 Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
780 Token("12", 0, 0)};
781
782 std::vector<Token> tokens;
783 int click_index;
784
785 // Try to click first token and see if it gets padded from left to maximum
786 // context_size.
787 tokens = tokens_orig;
788 click_index = 0;
789 internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
790 // clang-format off
791 EXPECT_EQ(tokens, std::vector<Token>({Token(),
792 Token(),
793 Token("0", 0, 0),
794 Token("1", 0, 0),
795 Token("2", 0, 0),
796 Token("3", 0, 0),
797 Token("4", 0, 0),
798 Token("5", 0, 0)}));
799 // clang-format on
800 EXPECT_EQ(click_index, 2);
801
802 // Clicking to the middle with enough context should not produce any padding.
803 tokens = tokens_orig;
804 click_index = 6;
805 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
806 // clang-format off
807 EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
808 Token("2", 0, 0),
809 Token("3", 0, 0),
810 Token("4", 0, 0),
811 Token("5", 0, 0),
812 Token("6", 0, 0),
813 Token("7", 0, 0),
814 Token("8", 0, 0),
815 Token("9", 0, 0)}));
816 // clang-format on
817 EXPECT_EQ(click_index, 5);
818
819 // Clicking at the end should pad right to maximum context_size.
820 tokens = tokens_orig;
821 click_index = 11;
822 internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
823 // clang-format off
824 EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
825 Token("7", 0, 0),
826 Token("8", 0, 0),
827 Token("9", 0, 0),
828 Token("10", 0, 0),
829 Token("11", 0, 0),
830 Token("12", 0, 0),
831 Token(),
832 Token()}));
833 // clang-format on
834 EXPECT_EQ(click_index, 5);
Lukas Zilka26e8c2e2017-04-06 15:54:24 +0200835}
836
Tony Mak6c4cc672018-09-17 11:48:50 +0100837TEST_F(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100838 FeatureProcessorOptionsT options;
839 options.tokenization_codepoint_config.emplace_back(
840 new TokenizationCodepointRangeT());
841 {
842 auto& config = options.tokenization_codepoint_config.back();
843 config->start = 0;
844 config->end = 256;
845 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
846 config->script_id = 1;
847 }
848 options.tokenize_on_script_change = false;
Lukas Zilka40c18de2017-04-10 17:22:22 +0200849
Lukas Zilka21d8c982018-01-24 11:11:20 +0100850 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
851 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100852 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100853 &unilib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100854
855 EXPECT_EQ(feature_processor.Tokenize("앨라배마123웹사이트"),
856 std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
857
858 options.tokenize_on_script_change = true;
859 flatbuffers::DetachedBuffer options_fb2 =
860 PackFeatureProcessorOptions(options);
861 TestingFeatureProcessor feature_processor2(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100862 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb2.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100863 &unilib_);
Lukas Zilka21d8c982018-01-24 11:11:20 +0100864
865 EXPECT_EQ(feature_processor2.Tokenize("앨라배마123웹사이트"),
866 std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
867 Token("웹사이트", 7, 11)}));
868}
869
#ifdef TC3_TEST_ICU
// Checks that, with the ICU tokenization type selected, an unspaced Thai
// string is split into the five expected tokens. Only compiled when
// TC3_TEST_ICU is defined.
TEST_F(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  UniLib unilib;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // clang-format off
  const std::vector<Token> expected_tokens({Token("พระบาท", 0, 6),
                                            Token("สมเด็จ", 6, 12),
                                            Token("พระ", 12, 15),
                                            Token("ปร", 15, 17),
                                            Token("มิ", 17, 19)});
  // clang-format on

  const std::vector<Token> actual_tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200891
#ifdef TC3_TEST_ICU
// Checks that, with the ICU tokenization type and
// `icu_preserve_whitespace_tokens` enabled, the spaces between Thai words are
// returned as tokens of their own. Only compiled when TC3_TEST_ICU is
// defined.
TEST_F(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  UniLib unilib;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // clang-format off
  const std::vector<Token> expected_tokens({Token("พระบาท", 0, 6),
                                            Token(" ", 6, 7),
                                            Token("สมเด็จ", 7, 13),
                                            Token(" ", 13, 14),
                                            Token("พระ", 14, 17),
                                            Token(" ", 17, 18),
                                            Token("ปร", 18, 20),
                                            Token(" ", 20, 21),
                                            Token("มิ", 21, 23)});
  // clang-format on

  const std::vector<Token> actual_tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
Lukas Zilka40c18de2017-04-10 17:22:22 +0200919
#ifdef TC3_TEST_ICU
// Checks the MIXED tokenization type on a string that combines Japanese,
// accented Latin text and a URL. The options declare codepoint 32 as a
// whitespace separator and list four internal-tokenizer codepoint ranges
// that together cover [0, 592). Only compiled when TC3_TEST_ICU is defined.
TEST_F(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint range [32, 33) acts as a whitespace separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& separator_config = options.tokenization_codepoint_config.back();
  separator_config->start = 32;
  separator_config->end = 33;
  separator_config->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Appends one [start, end) entry to the internal tokenizer's ranges.
  const auto add_internal_tokenizer_range = [&options](int start, int end) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.internal_tokenizer_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  add_internal_tokenizer_range(0, 128);
  add_internal_tokenizer_range(128, 256);
  add_internal_tokenizer_range(256, 384);
  add_internal_tokenizer_range(384, 592);

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  UniLib unilib;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // clang-format off
  const std::vector<Token> expected_tokens(
      {Token("こんにちは", 0, 5),
       Token("Japanese-ląnguagę", 5, 22),
       Token("text", 23, 27),
       Token("世界", 28, 30),
       Token("http://www.google.com/", 31, 53)});
  // clang-format on

  const std::vector<Token> actual_tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
Matt Sharifif95c3bd2017-04-25 18:41:11 +0200981
Tony Mak6c4cc672018-09-17 11:48:50 +0100982TEST_F(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
Lukas Zilka21d8c982018-01-24 11:11:20 +0100983 FeatureProcessorOptionsT options;
984 options.ignored_span_boundary_codepoints.push_back('.');
985 options.ignored_span_boundary_codepoints.push_back(',');
986 options.ignored_span_boundary_codepoints.push_back('[');
987 options.ignored_span_boundary_codepoints.push_back(']');
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200988
Lukas Zilka21d8c982018-01-24 11:11:20 +0100989 flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
990 TestingFeatureProcessor feature_processor(
Lukas Zilkab23e2122018-02-09 10:25:19 +0100991 flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
Tony Mak6c4cc672018-09-17 11:48:50 +0100992 &unilib_);
Lukas Zilkae5ea2ab2017-10-11 10:50:05 +0200993
994 const std::string text1_utf8 = "ěščř";
995 const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
996 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
997 text1.begin(), text1.end(),
998 /*count_from_beginning=*/true),
999 0);
1000 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1001 text1.begin(), text1.end(),
1002 /*count_from_beginning=*/false),
1003 0);
1004
1005 const std::string text2_utf8 = ".,abčd";
1006 const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
1007 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1008 text2.begin(), text2.end(),
1009 /*count_from_beginning=*/true),
1010 2);
1011 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1012 text2.begin(), text2.end(),
1013 /*count_from_beginning=*/false),
1014 0);
1015
1016 const std::string text3_utf8 = ".,abčd[]";
1017 const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
1018 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1019 text3.begin(), text3.end(),
1020 /*count_from_beginning=*/true),
1021 2);
1022 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1023 text3.begin(), text3.end(),
1024 /*count_from_beginning=*/false),
1025 2);
1026
1027 const std::string text4_utf8 = "[abčd]";
1028 const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
1029 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1030 text4.begin(), text4.end(),
1031 /*count_from_beginning=*/true),
1032 1);
1033 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1034 text4.begin(), text4.end(),
1035 /*count_from_beginning=*/false),
1036 1);
1037
1038 const std::string text5_utf8 = "";
1039 const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
1040 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1041 text5.begin(), text5.end(),
1042 /*count_from_beginning=*/true),
1043 0);
1044 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1045 text5.begin(), text5.end(),
1046 /*count_from_beginning=*/false),
1047 0);
1048
1049 const std::string text6_utf8 = "012345ěščř";
1050 const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
1051 UnicodeText::const_iterator text6_begin = text6.begin();
1052 std::advance(text6_begin, 6);
1053 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1054 text6_begin, text6.end(),
1055 /*count_from_beginning=*/true),
1056 0);
1057 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1058 text6_begin, text6.end(),
1059 /*count_from_beginning=*/false),
1060 0);
1061
1062 const std::string text7_utf8 = "012345.,ěščř";
1063 const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
1064 UnicodeText::const_iterator text7_begin = text7.begin();
1065 std::advance(text7_begin, 6);
1066 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1067 text7_begin, text7.end(),
1068 /*count_from_beginning=*/true),
1069 2);
1070 UnicodeText::const_iterator text7_end = text7.begin();
1071 std::advance(text7_end, 8);
1072 EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1073 text7.begin(), text7_end,
1074 /*count_from_beginning=*/false),
1075 2);
1076
1077 // Test not stripping.
1078 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1079 "Hello [[[Wořld]] or not?", {0, 24}),
1080 std::make_pair(0, 24));
1081 // Test basic stripping.
1082 EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1083 "Hello [[[Wořld]] or not?", {6, 16}),
1084 std::make_pair(9, 14));
1085 // Test stripping when everything is stripped.
1086 EXPECT_EQ(
1087 feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
1088 std::make_pair(6, 6));
1089 // Test stripping empty string.
1090 EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
1091 std::make_pair(0, 0));
1092}
1093
Tony Mak6c4cc672018-09-17 11:48:50 +01001094TEST_F(FeatureProcessorTest, CodepointSpanToTokenSpan) {
Lukas Zilka726b4d22017-12-13 16:37:03 +01001095 const std::vector<Token> tokens{Token("Hělló", 0, 5),
1096 Token("fěěbař@google.com", 6, 23),
1097 Token("heře!", 24, 29)};
1098
1099 // Spans matching the tokens exactly.
1100 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
1101 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
1102 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
1103 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
1104 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
1105 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
1106
1107 // Snapping to containing tokens has no effect.
1108 EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
1109 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
1110 EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
1111 EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
1112 EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
1113 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
1114
1115 // Span boundaries inside tokens.
1116 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
1117 EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
1118
1119 // Tokens adjacent to the span, but not overlapping.
1120 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
1121 EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
1122}
1123
Matt Sharifid40f9762017-03-14 21:24:23 +01001124} // namespace
Tony Mak6c4cc672018-09-17 11:48:50 +01001125} // namespace libtextclassifier3