Adds functionality and a model that contains regex for emails and urls. (G3 sync)
Test: Builds, tested on device.
Bug: 68296108
Change-Id: Id954f71d4f4b74f4bb81e3242265154f23cfbc0f
diff --git a/smartselect/feature-processor.h b/smartselect/feature-processor.h
index a39a789..ef9a3df 100644
--- a/smartselect/feature-processor.h
+++ b/smartselect/feature-processor.h
@@ -53,11 +53,6 @@
TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
const FeatureProcessorOptions& options);
-// Removes tokens that are not part of a line of the context which contains
-// given span.
-void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
- std::vector<Token>* tokens);
-
// Splits tokens that contain the selection boundary inside them.
// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
void SplitTokensOnSelectionBoundaries(CodepointSpan selection,
@@ -81,8 +76,12 @@
} // namespace internal
// Converts a codepoint span to a token span in the given list of tokens.
-TokenSpan CodepointSpanToTokenSpan(const std::vector<Token>& selectable_tokens,
- CodepointSpan codepoint_span);
+// If snap_boundaries_to_containing_tokens is set to true, it is enough for a
+// token to overlap with the codepoint range to be considered part of it.
+// Otherwise it must be fully included in the range.
+TokenSpan CodepointSpanToTokenSpan(
+ const std::vector<Token>& selectable_tokens, CodepointSpan codepoint_span,
+ bool snap_boundaries_to_containing_tokens = false);
// Converts a token span to a codepoint span in the given list of tokens.
CodepointSpan TokenSpanToCodepointSpan(
@@ -139,8 +138,8 @@
// Extracts features as a CachedFeatures object that can be used for repeated
// inference over token spans in the given context.
- // When relative_click_span == {kInvalidIndex, kInvalidIndex} then all tokens
- // extracted from context will be considered.
+ // When input_span == {kInvalidIndex, kInvalidIndex} then, relative_click_span
+ // is ignored, and all tokens extracted from context will be considered.
bool ExtractFeatures(const std::string& context, CodepointSpan input_span,
TokenSpan relative_click_span,
const FeatureVectorFn& feature_vector_fn,
@@ -159,6 +158,10 @@
return feature_extractor_.DenseFeaturesCount();
}
+ // Splits context to several segments according to configuration.
+ std::vector<UnicodeTextRange> SplitContext(
+ const UnicodeText& context_unicode) const;
+
// Strips boundary codepoints from the span in context and returns the new
// start and end indices. If the span comprises entirely of boundary
// codepoints, the first index of span is returned for both indices.
@@ -249,6 +252,11 @@
void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
std::vector<Token>* result) const;
+ // Removes all tokens from tokens that are not on a line (defined by calling
+ // SplitContext on the context) to which span points.
+ void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
+ std::vector<Token>* tokens) const;
+
const TokenFeatureExtractor feature_extractor_;
// Codepoint ranges that define what codepoints are supported by the model.