Sync of libtextclassifier from Google3.

Exported by: knowledge/cerebra/sense/text_classifier/lib/export_to_aosp.sh

Bug: 67618889
Test: Builds. Also tested with oc-mr1 and verified that the smartselect/sharing features work.
Change-Id: I25ad82cdd5eed20c60e83e7eb94dae6ab08b3690

commit: e5ea2abb97e8baf399747bd14f89e6a73dacd584 [log] [tgz]
author: Lukas Zilka <zilka@google.com> Wed Oct 11 10:50:05 2017 +0200
committer: Lukas Zilka <zilka@google.com> Wed Oct 11 20:10:22 2017 +0200
tree: 9bd4299251745ce07140056f4dff21f3e214f0f6
parent: 2623912320d637c03bb879152283ca4dbb655839 [diff] [blame]
diff --git a/smartselect/token-feature-extractor.h b/smartselect/token-feature-extractor.h
index 8287fbd..5afeca4 100644
--- a/smartselect/token-feature-extractor.h
+++ b/smartselect/token-feature-extractor.h

@@ -18,12 +18,14 @@
 #define LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
 
 #include <memory>
+#include <unordered_set>
 #include <vector>
 
-#include "base.h"
 #include "smartselect/types.h"
 #include "util/strings/stringpiece.h"
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
 #include "unicode/regex.h"
+#endif
 
 namespace libtextclassifier {
 
@@ -55,6 +57,12 @@
 
   // Maximum length of a word.
   int max_word_length = 20;
+
+  // Set of allowed charactergrams. Extracted charactergrams are filtered
+  // against this set; any charactergram not present in it is interpreted as
+  // out-of-vocabulary.
+  // If allowed_chargrams is empty, all charactergrams are allowed.
+  std::unordered_set<std::string> allowed_chargrams;
 };
 
 class TokenFeatureExtractor {
@@ -73,8 +81,16 @@
                std::vector<float>* dense_features) const;
 
   int DenseFeaturesCount() const {
-    return options_.extract_case_feature +
-           options_.extract_selection_mask_feature + regex_patterns_.size();
+    int feature_count =
+        options_.extract_case_feature + options_.extract_selection_mask_feature;
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
+    feature_count += regex_patterns_.size();
+#else
+    if (enable_all_caps_feature_) {
+      feature_count += 1;
+    }
+#endif
+    return feature_count;
   }
 
  protected:
@@ -94,8 +110,11 @@
 
  private:
   TokenFeatureExtractorOptions options_;
-
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
   std::vector<std::unique_ptr<icu::RegexPattern>> regex_patterns_;
+#else
+  bool enable_all_caps_feature_ = false;
+#endif
 };
 
 }  // namespace libtextclassifier
commit	e5ea2abb97e8baf399747bd14f89e6a73dacd584	[log] [tgz]
author	Lukas Zilka <zilka@google.com>	Wed Oct 11 10:50:05 2017 +0200
committer	Lukas Zilka <zilka@google.com>	Wed Oct 11 20:10:22 2017 +0200
tree	9bd4299251745ce07140056f4dff21f3e214f0f6
parent	2623912320d637c03bb879152283ca4dbb655839 [diff] [blame]