Sync from google3.

Bug: 68239358
Test: Builds. Tested on device. CTS test passes.

bit FrameworksCoreTests:android.view.textclassifier.TextClassificationManagerTest

Change-Id: Ie5e20b06b1c615ab246e7ed7f08e980e61c492c4
diff --git a/token-feature-extractor.cc b/token-feature-extractor.cc
index 33c4d75..e194179 100644
--- a/token-feature-extractor.cc
+++ b/token-feature-extractor.cc
@@ -74,10 +74,88 @@
     : options_(options), unilib_(unilib) {
   for (const std::string& pattern : options.regexp_features) {
     regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
-        unilib_.CreateRegexPattern(pattern)));
+        unilib_.CreateRegexPattern(UTF8ToUnicodeText(
+            pattern.c_str(), pattern.size(), /*do_copy=*/false))));
   }
 }
 
+bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
+                                    std::vector<int>* sparse_features,
+                                    std::vector<float>* dense_features) const {
+  // Both output vectors are mandatory; bail out if either one is missing.
+  if (!sparse_features || !dense_features) return false;
+  // Each output is overwritten with freshly extracted features.
+  *sparse_features = ExtractCharactergramFeatures(token);
+  *dense_features = ExtractDenseFeatures(token, is_in_span);
+  return true;
+}
+
+std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
+    const Token& token) const {
+  // Dispatches on the unicode_aware_features option: the Unicode extractor
+  // when it is set, the ASCII extractor otherwise.
+  return options_.unicode_aware_features
+             ? ExtractCharactergramFeaturesUnicode(token)
+             : ExtractCharactergramFeaturesAscii(token);
+}
+
+// Builds the dense feature vector for `token`. Depending on the options, the
+// result holds, in this order:
+//   * a case feature: 1.0 if the token starts with an uppercase character,
+//     -1.0 otherwise (including for an empty token);
+//   * a selection-mask feature derived from `is_in_span`;
+//   * one value per configured regexp pattern: 1.0 on a match, else -1.0.
+std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
+    const Token& token, bool is_in_span) const {
+  std::vector<float> dense_features;
+
+  if (options_.extract_case_feature) {
+    // Empty tokens are never upper-case. Check for emptiness *before* looking
+    // at the first character: dereferencing begin() of an empty string is
+    // undefined behavior.
+    bool is_upper = false;
+    if (!token.value.empty()) {
+      if (options_.unicode_aware_features) {
+        UnicodeText token_unicode =
+            UTF8ToUnicodeText(token.value, /*do_copy=*/false);
+        is_upper = unilib_.IsUpper(*token_unicode.begin());
+      } else {
+        // isupper() requires a value representable as unsigned char (or EOF).
+        is_upper = isupper(static_cast<unsigned char>(token.value[0])) != 0;
+      }
+    }
+    dense_features.push_back(is_upper ? 1.0 : -1.0);
+  }
+
+  if (options_.extract_selection_mask_feature) {
+    if (is_in_span) {
+      dense_features.push_back(1.0);
+    } else {
+      // The "not selected" value differs between the two feature modes.
+      dense_features.push_back(options_.unicode_aware_features ? -1.0 : 0.0);
+    }
+  }
+
+  // Add regexp features: 1.0 on a match, -1.0 on no match or missing pattern.
+  if (!regex_patterns_.empty()) {
+    // Convert the token once; every pattern is matched against the same text.
+    UnicodeText token_unicode =
+        UTF8ToUnicodeText(token.value, /*do_copy=*/false);
+    for (const auto& pattern : regex_patterns_) {
+      if (!pattern) {
+        dense_features.push_back(-1.0);
+        continue;
+      }
+      auto matcher = pattern->Matcher(token_unicode);
+      int status = 0;  // Initialized so it is never read indeterminate.
+      const bool matched = matcher && matcher->Matches(&status);
+      dense_features.push_back(matched ? 1.0 : -1.0);
+    }
+  }
+
+  return dense_features;
+}
+
 int TokenFeatureExtractor::HashToken(StringPiece token) const {
   if (options_.allowed_chargrams.empty()) {
     return tc2farmhash::Fingerprint64(token) % options_.num_buckets;
@@ -101,15 +179,6 @@
   }
 }
 
-std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
-    const Token& token) const {
-  if (options_.unicode_aware_features) {
-    return ExtractCharactergramFeaturesUnicode(token);
-  } else {
-    return ExtractCharactergramFeaturesAscii(token);
-  }
-}
-
 std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
     const Token& token) const {
   std::vector<int> result;
@@ -237,63 +306,4 @@
   return result;
 }
 
-bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
-                                    std::vector<int>* sparse_features,
-                                    std::vector<float>* dense_features) const {
-  if (sparse_features == nullptr || dense_features == nullptr) {
-    return false;
-  }
-
-  *sparse_features = ExtractCharactergramFeatures(token);
-
-  if (options_.extract_case_feature) {
-    if (options_.unicode_aware_features) {
-      UnicodeText token_unicode =
-          UTF8ToUnicodeText(token.value, /*do_copy=*/false);
-      const bool is_upper = unilib_.IsUpper(*token_unicode.begin());
-      if (!token.value.empty() && is_upper) {
-        dense_features->push_back(1.0);
-      } else {
-        dense_features->push_back(-1.0);
-      }
-    } else {
-      if (!token.value.empty() && isupper(*token.value.begin())) {
-        dense_features->push_back(1.0);
-      } else {
-        dense_features->push_back(-1.0);
-      }
-    }
-  }
-
-  if (options_.extract_selection_mask_feature) {
-    if (is_in_span) {
-      dense_features->push_back(1.0);
-    } else {
-      if (options_.unicode_aware_features) {
-        dense_features->push_back(-1.0);
-      } else {
-        dense_features->push_back(0.0);
-      }
-    }
-  }
-
-  // Add regexp features.
-  if (!regex_patterns_.empty()) {
-    for (int i = 0; i < regex_patterns_.size(); ++i) {
-      if (!regex_patterns_[i].get()) {
-        dense_features->push_back(-1.0);
-        continue;
-      }
-
-      if (regex_patterns_[i]->Matches(token.value)) {
-        dense_features->push_back(1.0);
-      } else {
-        dense_features->push_back(-1.0);
-      }
-    }
-  }
-
-  return true;
-}
-
 }  // namespace libtextclassifier2