Adds support for ICU tokenization.

Bug: 36886053

Test: Built, tested on device, google3 regression and unit tests pass.
Change-Id: Ia3345c6c7a5aa816233d3b3ae10e2a92b31f08a7
diff --git a/tests/feature-processor_test.cc b/tests/feature-processor_test.cc
index e3a39e3..cf09f96 100644
--- a/tests/feature-processor_test.cc
+++ b/tests/feature-processor_test.cc
@@ -204,13 +204,13 @@
   using FeatureProcessor::SpanToLabel;
   using FeatureProcessor::SupportedCodepointsRatio;
   using FeatureProcessor::IsCodepointSupported;
+  using FeatureProcessor::ICUTokenize;
 };
 
 TEST(FeatureProcessorTest, SpanToLabel) {
   FeatureProcessorOptions options;
   options.set_context_size(1);
   options.set_max_selection_span(1);
-  options.set_tokenize_on_space(true);
   options.set_snap_label_span_boundaries_to_containing_tokens(false);
 
   TokenizationCodepointRange* config =
@@ -519,5 +519,45 @@
   EXPECT_EQ(click_index, 5);
 }
 
+// Checks ICU tokenization of a Thai string that contains no whitespace.
+TEST(FeatureProcessorTest, ICUTokenize) {
+  FeatureProcessorOptions icu_options;
+  icu_options.set_tokenization_type(FeatureProcessorOptions::ICU);
+  TestingFeatureProcessor processor(icu_options);
+  std::vector<Token> tokens = processor.Tokenize("พระบาทสมเด็จพระปรมิ");
+
+  // clang-format off
+  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
+                                       Token("สมเด็จ", 6, 12),
+                                       Token("พระ", 12, 15),
+                                       Token("ปร", 15, 17),
+                                       Token("มิ", 17, 19)};
+  // clang-format on
+  ASSERT_EQ(tokens, expected);
+}
+
+// Checks ICU tokenization with icu_preserve_whitespace_tokens enabled: every
+// space in the input is expected to come back as a " " token of its own.
+TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
+  FeatureProcessorOptions icu_options;
+  icu_options.set_tokenization_type(FeatureProcessorOptions::ICU);
+  icu_options.set_icu_preserve_whitespace_tokens(true);
+  TestingFeatureProcessor processor(icu_options);
+  std::vector<Token> tokens = processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
+
+  // clang-format off
+  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
+                                       Token(" ", 6, 7),
+                                       Token("สมเด็จ", 7, 13),
+                                       Token(" ", 13, 14),
+                                       Token("พระ", 14, 17),
+                                       Token(" ", 17, 18),
+                                       Token("ปร", 18, 20),
+                                       Token(" ", 20, 21),
+                                       Token("มิ", 21, 23)};
+  // clang-format on
+  ASSERT_EQ(tokens, expected);
+}
+
 }  // namespace
 }  // namespace libtextclassifier