Import libtextclassifier code and models.
This fixes a problem where models were not unmapped.
Also includes updated model files for some Tier-1 languages and
improved tokenization for CJT.
Switch back to dynamic linking of protobuf-lite library.
Bug: 37446398
Bug: 36886059
Test: Unit tests pass.
Change-Id: I5f9e8747918f49d8f1f7c65f3b8a6610141795df
diff --git a/tests/feature-processor_test.cc b/tests/feature-processor_test.cc
index cf09f96..4e27afc 100644
--- a/tests/feature-processor_test.cc
+++ b/tests/feature-processor_test.cc
@@ -203,8 +203,9 @@
using FeatureProcessor::FeatureProcessor;
using FeatureProcessor::SpanToLabel;
using FeatureProcessor::SupportedCodepointsRatio;
- using FeatureProcessor::IsCodepointSupported;
+ using FeatureProcessor::IsCodepointInRanges;
using FeatureProcessor::ICUTokenize;
+ using FeatureProcessor::supported_codepoint_ranges_;
};
TEST(FeatureProcessorTest, SpanToLabel) {
@@ -369,15 +370,24 @@
EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
1, feature_processor.Tokenize("ěěě řřř ěěě")),
FloatEq(0.0));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ -1, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 0, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 10, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 127, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 128, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 9999, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 10000, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 10001, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 25000, feature_processor.supported_codepoint_ranges_));
std::vector<Token> tokens;
int click_pos;
@@ -559,5 +569,46 @@
// clang-format on
}
+TEST(FeatureProcessorTest, MixedTokenize) {
+ FeatureProcessorOptions options;
+ options.set_tokenization_type(
+ libtextclassifier::FeatureProcessorOptions::MIXED);
+
+ TokenizationCodepointRange* config =
+ options.add_tokenization_codepoint_config();
+ config->set_start(32);
+ config->set_end(33);
+ config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
+
+ FeatureProcessorOptions::CodepointRange* range;
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(0);
+ range->set_end(128);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(128);
+ range->set_end(256);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(256);
+ range->set_end(384);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(384);
+ range->set_end(592);
+
+ TestingFeatureProcessor feature_processor(options);
+ std::vector<Token> tokens = feature_processor.Tokenize(
+ "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
+ ASSERT_EQ(tokens,
+ // clang-format off
+ std::vector<Token>({Token("こんにちは", 0, 5),
+ Token("Japanese-ląnguagę", 5, 22),
+ Token("text", 23, 27),
+ Token("世界", 28, 30),
+ Token("http://www.google.com/", 31, 53)}));
+ // clang-format on
+}
+
} // namespace
} // namespace libtextclassifier