Import libtextclassifier code and models.
This fixes a problem where models were not unmapped.
Also includes updated model files for some Tier-1 languages and
improved tokenization for CJT.
Switch back to dynamic linking of protobuf-lite library.
Bug: 37446398
Bug: 36886059
Test: Unit tests pass.
Change-Id: I5f9e8747918f49d8f1f7c65f3b8a6610141795df
diff --git a/tests/feature-processor_test.cc b/tests/feature-processor_test.cc
index cf09f96..4e27afc 100644
--- a/tests/feature-processor_test.cc
+++ b/tests/feature-processor_test.cc
@@ -203,8 +203,9 @@
using FeatureProcessor::FeatureProcessor;
using FeatureProcessor::SpanToLabel;
using FeatureProcessor::SupportedCodepointsRatio;
- using FeatureProcessor::IsCodepointSupported;
+ using FeatureProcessor::IsCodepointInRanges;
using FeatureProcessor::ICUTokenize;
+ using FeatureProcessor::supported_codepoint_ranges_;
};
TEST(FeatureProcessorTest, SpanToLabel) {
@@ -369,15 +370,24 @@
EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
1, feature_processor.Tokenize("ěěě řřř ěěě")),
FloatEq(0.0));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
- EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
- EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ -1, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 0, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 10, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 127, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 128, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 9999, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 10000, feature_processor.supported_codepoint_ranges_));
+ EXPECT_FALSE(feature_processor.IsCodepointInRanges(
+ 10001, feature_processor.supported_codepoint_ranges_));
+ EXPECT_TRUE(feature_processor.IsCodepointInRanges(
+ 25000, feature_processor.supported_codepoint_ranges_));
std::vector<Token> tokens;
int click_pos;
@@ -559,5 +569,46 @@
// clang-format on
}
+TEST(FeatureProcessorTest, MixedTokenize) {
+ FeatureProcessorOptions options;
+ options.set_tokenization_type(
+ libtextclassifier::FeatureProcessorOptions::MIXED);
+
+ TokenizationCodepointRange* config =
+ options.add_tokenization_codepoint_config();
+ config->set_start(32);
+ config->set_end(33);
+ config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
+
+ FeatureProcessorOptions::CodepointRange* range;
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(0);
+ range->set_end(128);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(128);
+ range->set_end(256);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(256);
+ range->set_end(384);
+
+ range = options.add_internal_tokenizer_codepoint_ranges();
+ range->set_start(384);
+ range->set_end(592);
+
+ TestingFeatureProcessor feature_processor(options);
+ std::vector<Token> tokens = feature_processor.Tokenize(
+ "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
+ ASSERT_EQ(tokens,
+ // clang-format off
+ std::vector<Token>({Token("こんにちは", 0, 5),
+ Token("Japanese-ląnguagę", 5, 22),
+ Token("text", 23, 27),
+ Token("世界", 28, 30),
+ Token("http://www.google.com/", 31, 53)}));
+ // clang-format on
+}
+
} // namespace
} // namespace libtextclassifier