Adds functionality and a model that contains regex for emails and urls. (G3 sync)

Test: Builds, tested on device.
Bug: 68296108
Change-Id: Id954f71d4f4b74f4bb81e3242265154f23cfbc0f
diff --git a/smartselect/feature-processor_test.cc b/smartselect/feature-processor_test.cc
index 1a9b9da..9bee67a 100644
--- a/smartselect/feature-processor_test.cc
+++ b/smartselect/feature-processor_test.cc
@@ -25,6 +25,18 @@
 using testing::ElementsAreArray;
 using testing::FloatEq;
 
+class TestingFeatureProcessor : public FeatureProcessor {
+ public:
+  using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
+  using FeatureProcessor::FeatureProcessor;
+  using FeatureProcessor::ICUTokenize;
+  using FeatureProcessor::IsCodepointInRanges;
+  using FeatureProcessor::SpanToLabel;
+  using FeatureProcessor::StripTokensFromOtherLines;
+  using FeatureProcessor::supported_codepoint_ranges_;
+  using FeatureProcessor::SupportedCodepointsRatio;
+};
+
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
   std::vector<Token> tokens{Token("Hělló", 0, 5),
                             Token("fěěbař@google.com", 6, 23),
@@ -107,6 +119,10 @@
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
+  FeatureProcessorOptions options;
+  options.set_only_use_line_with_click(true);
+  TestingFeatureProcessor feature_processor(options);
+
   const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan span = {0, 5};
   // clang-format off
@@ -119,12 +135,16 @@
   // clang-format on
 
   // Keeps the first line.
-  internal::StripTokensFromOtherLines(context, span, &tokens);
+  feature_processor.StripTokensFromOtherLines(context, span, &tokens);
   EXPECT_THAT(tokens,
               ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
+  FeatureProcessorOptions options;
+  options.set_only_use_line_with_click(true);
+  TestingFeatureProcessor feature_processor(options);
+
   const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan span = {18, 22};
   // clang-format off
@@ -137,12 +157,16 @@
   // clang-format on
 
   // Keeps the first line.
-  internal::StripTokensFromOtherLines(context, span, &tokens);
+  feature_processor.StripTokensFromOtherLines(context, span, &tokens);
   EXPECT_THAT(tokens, ElementsAreArray(
                           {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickThird) {
+  FeatureProcessorOptions options;
+  options.set_only_use_line_with_click(true);
+  TestingFeatureProcessor feature_processor(options);
+
   const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan span = {24, 33};
   // clang-format off
@@ -155,12 +179,16 @@
   // clang-format on
 
   // Keeps the first line.
-  internal::StripTokensFromOtherLines(context, span, &tokens);
+  feature_processor.StripTokensFromOtherLines(context, span, &tokens);
   EXPECT_THAT(tokens, ElementsAreArray(
                           {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
+  FeatureProcessorOptions options;
+  options.set_only_use_line_with_click(true);
+  TestingFeatureProcessor feature_processor(options);
+
   const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
   const CodepointSpan span = {18, 22};
   // clang-format off
@@ -173,12 +201,16 @@
   // clang-format on
 
   // Keeps the first line.
-  internal::StripTokensFromOtherLines(context, span, &tokens);
+  feature_processor.StripTokensFromOtherLines(context, span, &tokens);
   EXPECT_THAT(tokens, ElementsAreArray(
                           {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
+  FeatureProcessorOptions options;
+  options.set_only_use_line_with_click(true);
+  TestingFeatureProcessor feature_processor(options);
+
   const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan span = {5, 23};
   // clang-format off
@@ -191,24 +223,13 @@
   // clang-format on
 
   // Keeps the first line.
-  internal::StripTokensFromOtherLines(context, span, &tokens);
+  feature_processor.StripTokensFromOtherLines(context, span, &tokens);
   EXPECT_THAT(tokens, ElementsAreArray(
                           {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
                            Token("Sěcond", 18, 23), Token("Lině", 19, 23),
                            Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
 }
 
-class TestingFeatureProcessor : public FeatureProcessor {
- public:
-  using FeatureProcessor::FeatureProcessor;
-  using FeatureProcessor::SpanToLabel;
-  using FeatureProcessor::SupportedCodepointsRatio;
-  using FeatureProcessor::IsCodepointInRanges;
-  using FeatureProcessor::ICUTokenize;
-  using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
-  using FeatureProcessor::supported_codepoint_ranges_;
-};
-
 TEST(FeatureProcessorTest, SpanToLabel) {
   FeatureProcessorOptions options;
   options.set_context_size(1);
@@ -782,5 +803,35 @@
             std::make_pair(0, 0));
 }
 
+TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
+  const std::vector<Token> tokens{Token("Hělló", 0, 5),
+                                  Token("fěěbař@google.com", 6, 23),
+                                  Token("heře!", 24, 29)};
+
+  // Spans matching the tokens exactly.
+  EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}));
+  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}));
+  EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}));
+  EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}));
+  EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}));
+  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}));
+
+  // Snapping to containing tokens has no effect.
+  EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(tokens, {0, 5}, true));
+  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {6, 23}, true));
+  EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(tokens, {24, 29}, true));
+  EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(tokens, {0, 23}, true));
+  EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(tokens, {6, 29}, true));
+  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {0, 29}, true));
+
+  // Span boundaries inside tokens.
+  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {1, 28}));
+  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(tokens, {1, 28}, true));
+
+  // Tokens adjacent to the span, but not overlapping.
+  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}));
+  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(tokens, {5, 24}, true));
+}
+
 }  // namespace
 }  // namespace libtextclassifier