Adds functionality and a model that contains regex for emails and urls. (G3 sync) Test: Builds, tested on device. Bug: 68296108 Change-Id: Id954f71d4f4b74f4bb81e3242265154f23cfbc0f

commit: 726b4d2f45a9de7289b936e9889efc7b914da3ee [log] [tgz]
author: Lukas Zilka <zilka@google.com> Wed Dec 13 16:37:03 2017 +0100
committer: Lukas Zilka <zilka@google.com> Thu Jan 11 08:27:01 2018 +0000
tree: dc37aa952e6f0d2ee64ea93b74bd4eef2f481eb0
parent: 1c19ec4fd68299425481c2dd4f41448073d746e7 [diff] [blame]
diff --git a/smartselect/feature-processor.h b/smartselect/feature-processor.h
index a39a789..ef9a3df 100644
--- a/smartselect/feature-processor.h
+++ b/smartselect/feature-processor.h

@@ -53,11 +53,6 @@
 TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
     const FeatureProcessorOptions& options);
 
-// Removes tokens that are not part of a line of the context which contains
-// given span.
-void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
-                               std::vector<Token>* tokens);
-
 // Splits tokens that contain the selection boundary inside them.
 // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
 void SplitTokensOnSelectionBoundaries(CodepointSpan selection,
@@ -81,8 +76,12 @@
 }  // namespace internal
 
 // Converts a codepoint span to a token span in the given list of tokens.
-TokenSpan CodepointSpanToTokenSpan(const std::vector<Token>& selectable_tokens,
-                                   CodepointSpan codepoint_span);
+// If snap_boundaries_to_containing_tokens is set to true, it is enough for a
+// token to overlap with the codepoint range to be considered part of it.
+// Otherwise it must be fully included in the range.
+TokenSpan CodepointSpanToTokenSpan(
+    const std::vector<Token>& selectable_tokens, CodepointSpan codepoint_span,
+    bool snap_boundaries_to_containing_tokens = false);
 
 // Converts a token span to a codepoint span in the given list of tokens.
 CodepointSpan TokenSpanToCodepointSpan(
@@ -139,8 +138,8 @@
 
   // Extracts features as a CachedFeatures object that can be used for repeated
   // inference over token spans in the given context.
-  // When relative_click_span == {kInvalidIndex, kInvalidIndex} then all tokens
-  // extracted from context will be considered.
+  // When input_span == {kInvalidIndex, kInvalidIndex}, relative_click_span is
+  // ignored and all tokens extracted from context will be considered.
   bool ExtractFeatures(const std::string& context, CodepointSpan input_span,
                        TokenSpan relative_click_span,
                        const FeatureVectorFn& feature_vector_fn,
@@ -159,6 +158,10 @@
     return feature_extractor_.DenseFeaturesCount();
   }
 
+  // Splits context to several segments according to configuration.
+  std::vector<UnicodeTextRange> SplitContext(
+      const UnicodeText& context_unicode) const;
+
   // Strips boundary codepoints from the span in context and returns the new
   // start and end indices. If the span consists entirely of boundary
   // codepoints, the first index of span is returned for both indices.
@@ -249,6 +252,11 @@
   void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
                          std::vector<Token>* result) const;
 
+  // Removes from tokens every token that is not on the line (as defined by
+  // calling SplitContext on the context) that span points to.
+  void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
+                                 std::vector<Token>* tokens) const;
+
   const TokenFeatureExtractor feature_extractor_;
 
   // Codepoint ranges that define what codepoints are supported by the model.
commit	726b4d2f45a9de7289b936e9889efc7b914da3ee	[log] [tgz]
author	Lukas Zilka <zilka@google.com>	Wed Dec 13 16:37:03 2017 +0100
committer	Lukas Zilka <zilka@google.com>	Thu Jan 11 08:27:01 2018 +0000
tree	dc37aa952e6f0d2ee64ea93b74bd4eef2f481eb0
parent	1c19ec4fd68299425481c2dd4f41448073d746e7 [diff] [blame]