Sync of libtextclassifier from Google3.
Exported by: knowledge/cerebra/sense/text_classifier/lib/export_to_aosp.sh
Bug: 67618889
Test: Builds. Also tested on oc-mr1; verified that the smartselect/sharing features work.
Change-Id: I25ad82cdd5eed20c60e83e7eb94dae6ab08b3690
diff --git a/smartselect/token-feature-extractor.h b/smartselect/token-feature-extractor.h
index 8287fbd..5afeca4 100644
--- a/smartselect/token-feature-extractor.h
+++ b/smartselect/token-feature-extractor.h
@@ -18,12 +18,14 @@
#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
#include <memory>
+#include <unordered_set>
#include <vector>
-#include "base.h"
#include "smartselect/types.h"
#include "util/strings/stringpiece.h"
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
#include "unicode/regex.h"
+#endif
namespace libtextclassifier {
@@ -55,6 +57,12 @@
// Maximum length of a word.
int max_word_length = 20;
+
+ // Set of allowed charactergrams. The extracted charactergrams are filtered
+ // against this set, and charactergrams not present in it are interpreted as
+ // out-of-vocabulary.
+ // If no allowed_chargrams are specified, all charactergrams are allowed.
+ std::unordered_set<std::string> allowed_chargrams;
};
class TokenFeatureExtractor {
@@ -73,8 +81,16 @@
std::vector<float>* dense_features) const;
int DenseFeaturesCount() const {
- return options_.extract_case_feature +
- options_.extract_selection_mask_feature + regex_patterns_.size();
+ int feature_count =
+ options_.extract_case_feature + options_.extract_selection_mask_feature;
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
+ feature_count += regex_patterns_.size();
+#else
+ if (enable_all_caps_feature_) {
+ feature_count += 1;
+ }
+#endif
+ return feature_count;
}
protected:
@@ -94,8 +110,11 @@
private:
TokenFeatureExtractorOptions options_;
-
+#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
std::vector<std::unique_ptr<icu::RegexPattern>> regex_patterns_;
+#else
+ bool enable_all_caps_feature_ = false;
+#endif
};
} // namespace libtextclassifier