Export lib3 to AOSP (external/libtextclassifier part)
1. Include both the annotator (the existing component) and actions (the
   new component for smart reply and actions).
2. Add one more model file: actions_suggestions.model is dropped into
   /etc/textclassifier/. It is around 7.5 MB for now; we will slim it
   down later.
3. Move the Java counterpart of the JNI from frameworks/base into this
   project.
Test: atest android.view.textclassifier.TextClassificationManagerTest
Change-Id: Icb2458967ef51efa2952b3eaddefbf1f7b359930
diff --git a/annotator/token-feature-extractor.cc b/annotator/token-feature-extractor.cc
new file mode 100644
index 0000000..86ab03a
--- /dev/null
+++ b/annotator/token-feature-extractor.cc
@@ -0,0 +1,333 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/token-feature-extractor.h"
+
+#include <cctype>
+#include <string>
+
+#include "utils/base/logging.h"
+#include "utils/hash/farmhash.h"
+#include "utils/strings/stringpiece.h"
+#include "utils/utf8/unicodetext.h"
+
+namespace libtextclassifier3 {
+
+namespace {
+
+std::string RemapTokenAscii(const std::string& token,
+ const TokenFeatureExtractorOptions& options) {
+ if (!options.remap_digits && !options.lowercase_tokens) {
+ return token;
+ }
+
+ std::string copy = token;
+  for (size_t i = 0; i < copy.size(); ++i) {
+    // Cast to unsigned char: passing negative char values (e.g. UTF-8
+    // continuation bytes) to isdigit()/tolower() is undefined behavior.
+    if (options.remap_digits &&
+        isdigit(static_cast<unsigned char>(copy[i]))) {
+      copy[i] = '0';
+    }
+    if (options.lowercase_tokens) {
+      copy[i] = tolower(static_cast<unsigned char>(copy[i]));
+    }
+  }
+ return copy;
+}
+
+void RemapTokenUnicode(const std::string& token,
+ const TokenFeatureExtractorOptions& options,
+ const UniLib& unilib, UnicodeText* remapped) {
+ if (!options.remap_digits && !options.lowercase_tokens) {
+ // Leave remapped untouched.
+ return;
+ }
+
+ UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);
+ remapped->clear();
+ for (auto it = word.begin(); it != word.end(); ++it) {
+ if (options.remap_digits && unilib.IsDigit(*it)) {
+ remapped->AppendCodepoint('0');
+ } else if (options.lowercase_tokens) {
+ remapped->AppendCodepoint(unilib.ToLower(*it));
+ } else {
+ remapped->AppendCodepoint(*it);
+ }
+ }
+}
+
+} // namespace
+
+TokenFeatureExtractor::TokenFeatureExtractor(
+ const TokenFeatureExtractorOptions& options, const UniLib& unilib)
+ : options_(options), unilib_(unilib) {
+ for (const std::string& pattern : options.regexp_features) {
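+    // CreateRegexPattern may return null for an invalid pattern; null
+    // entries are kept so that feature indices stay aligned with the
+    // pattern list, and each null emits a constant -1.0 feature below.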
+ regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
+ unilib_.CreateRegexPattern(UTF8ToUnicodeText(
+ pattern.c_str(), pattern.size(), /*do_copy=*/false))));
+ }
+}
+
+bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
+ std::vector<int>* sparse_features,
+ std::vector<float>* dense_features) const {
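+  // Dense features are required; sparse (charactergram) features are only
+  // extracted when a non-null sparse_features pointer is provided.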
+ if (!dense_features) {
+ return false;
+ }
+ if (sparse_features) {
+ *sparse_features = ExtractCharactergramFeatures(token);
+ }
+ *dense_features = ExtractDenseFeatures(token, is_in_span);
+ return true;
+}
+
+std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
+ const Token& token) const {
+ if (options_.unicode_aware_features) {
+ return ExtractCharactergramFeaturesUnicode(token);
+ } else {
+ return ExtractCharactergramFeaturesAscii(token);
+ }
+}
+
+std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
+ const Token& token, bool is_in_span) const {
+ std::vector<float> dense_features;
+
+ if (options_.extract_case_feature) {
+ if (options_.unicode_aware_features) {
+      const UnicodeText token_unicode =
+          UTF8ToUnicodeText(token.value, /*do_copy=*/false);
+      // Only dereference begin() after checking that the token is non-empty.
+      const bool is_upper =
+          !token.value.empty() && unilib_.IsUpper(*token_unicode.begin());
+      if (is_upper) {
+ dense_features.push_back(1.0);
+ } else {
+ dense_features.push_back(-1.0);
+ }
+ } else {
+ if (!token.value.empty() && isupper(*token.value.begin())) {
+ dense_features.push_back(1.0);
+ } else {
+ dense_features.push_back(-1.0);
+ }
+ }
+ }
+
+ if (options_.extract_selection_mask_feature) {
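+    // Note: tokens outside the span get -1.0 in unicode mode but 0.0 in
+    // ascii mode, presumably matching what the respective models were
+    // trained with.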
+ if (is_in_span) {
+ dense_features.push_back(1.0);
+ } else {
+ if (options_.unicode_aware_features) {
+ dense_features.push_back(-1.0);
+ } else {
+ dense_features.push_back(0.0);
+ }
+ }
+ }
+
+ // Add regexp features.
+ if (!regex_patterns_.empty()) {
+ UnicodeText token_unicode =
+ UTF8ToUnicodeText(token.value, /*do_copy=*/false);
+    for (int i = 0; i < static_cast<int>(regex_patterns_.size()); ++i) {
+ if (!regex_patterns_[i].get()) {
+ dense_features.push_back(-1.0);
+ continue;
+ }
+ auto matcher = regex_patterns_[i]->Matcher(token_unicode);
+ int status;
+ if (matcher->Matches(&status)) {
+ dense_features.push_back(1.0);
+ } else {
+ dense_features.push_back(-1.0);
+ }
+ }
+ }
+
+ return dense_features;
+}
+
+int TokenFeatureExtractor::HashToken(StringPiece token) const {
+ if (options_.allowed_chargrams.empty()) {
+ return tc2farmhash::Fingerprint64(token) % options_.num_buckets;
+ } else {
+ // Padding and out-of-vocabulary tokens have extra buckets reserved because
+ // they are special and important tokens, and we don't want them to share
+ // embedding with other charactergrams.
+ // TODO(zilka): Experimentally verify.
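+    // Resulting bucket layout: 0 = out-of-vocabulary, 1 = "<PAD>", and
+    // [2, num_buckets) for the hashed chargrams.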
+ const int kNumExtraBuckets = 2;
+ const std::string token_string = token.ToString();
+ if (token_string == "<PAD>") {
+ return 1;
+ } else if (options_.allowed_chargrams.find(token_string) ==
+ options_.allowed_chargrams.end()) {
+ return 0; // Out-of-vocabulary.
+ } else {
+ return (tc2farmhash::Fingerprint64(token) %
+ (options_.num_buckets - kNumExtraBuckets)) +
+ kNumExtraBuckets;
+ }
+ }
+}
+
+std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
+ const Token& token) const {
+ std::vector<int> result;
+ if (token.is_padding || token.value.empty()) {
+ result.push_back(HashToken("<PAD>"));
+ } else {
+ const std::string word = RemapTokenAscii(token.value, options_);
+
+ // Trim words that are over max_word_length characters.
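+    // E.g. with max_word_length == 6, "documentation" becomes
+    // "^doc\1ion$", where '\1' marks the elided middle of the word.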
+ const int max_word_length = options_.max_word_length;
+ std::string feature_word;
+    if (static_cast<int>(word.size()) > max_word_length) {
+ feature_word =
+ "^" + word.substr(0, max_word_length / 2) + "\1" +
+ word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
+ "$";
+ } else {
+ // Add a prefix and suffix to the word.
+ feature_word = "^" + word + "$";
+ }
+
+    // Upper-bound the number of charactergrams extracted to avoid resizing.
+ result.reserve(options_.chargram_orders.size() * feature_word.size());
+
+ if (options_.chargram_orders.empty()) {
+ result.push_back(HashToken(feature_word));
+ } else {
+ // Generate the character-grams.
+ for (int chargram_order : options_.chargram_orders) {
+ if (chargram_order == 1) {
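+          // Unigrams skip the "^" and "$" boundary markers added above.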
+          for (int i = 1; i < static_cast<int>(feature_word.size()) - 1; ++i) {
+ result.push_back(
+ HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));
+ }
+ } else {
+ for (int i = 0;
+ i < static_cast<int>(feature_word.size()) - chargram_order + 1;
+ ++i) {
+ result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,
+ /*len=*/chargram_order)));
+ }
+ }
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
+ const Token& token) const {
+ std::vector<int> result;
+ if (token.is_padding || token.value.empty()) {
+ result.push_back(HashToken("<PAD>"));
+ } else {
+ UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);
+ RemapTokenUnicode(token.value, options_, unilib_, &word);
+
+ // Trim the word if needed by finding a left-cut point and right-cut point.
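+    // Walking in max_word_length / 2 codepoints from each end keeps at
+    // most max_word_length codepoints; if the cut points meet, the word
+    // is short enough to be kept whole.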
+ auto left_cut = word.begin();
+ auto right_cut = word.end();
+ for (int i = 0; i < options_.max_word_length / 2; i++) {
+ if (left_cut < right_cut) {
+ ++left_cut;
+ }
+ if (left_cut < right_cut) {
+ --right_cut;
+ }
+ }
+
+ std::string feature_word;
+ if (left_cut == right_cut) {
+ feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
+ } else {
+ // clang-format off
+ feature_word = "^" +
+ word.UTF8Substring(word.begin(), left_cut) +
+ "\1" +
+ word.UTF8Substring(right_cut, word.end()) +
+ "$";
+ // clang-format on
+ }
+
+ const UnicodeText feature_word_unicode =
+ UTF8ToUnicodeText(feature_word, /*do_copy=*/false);
+
+    // Upper-bound the number of charactergrams extracted to avoid resizing.
+ result.reserve(options_.chargram_orders.size() * feature_word.size());
+
+ if (options_.chargram_orders.empty()) {
+ result.push_back(HashToken(feature_word));
+ } else {
+ // Generate the character-grams.
+ for (int chargram_order : options_.chargram_orders) {
+ UnicodeText::const_iterator it_start = feature_word_unicode.begin();
+ UnicodeText::const_iterator it_end = feature_word_unicode.end();
+ if (chargram_order == 1) {
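+          // Unigrams skip the "^" and "$" boundary markers added above.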
+ ++it_start;
+ --it_end;
+ }
+
+ UnicodeText::const_iterator it_chargram_start = it_start;
+ UnicodeText::const_iterator it_chargram_end = it_start;
+ bool chargram_is_complete = true;
+ for (int i = 0; i < chargram_order; ++i) {
+ if (it_chargram_end == it_end) {
+ chargram_is_complete = false;
+ break;
+ }
+ ++it_chargram_end;
+ }
+ if (!chargram_is_complete) {
+ continue;
+ }
+
+ for (; it_chargram_end <= it_end;
+ ++it_chargram_start, ++it_chargram_end) {
+ const int length_bytes =
+ it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
+ result.push_back(HashToken(
+ StringPiece(it_chargram_start.utf8_data(), length_bytes)));
+ }
+ }
+ }
+ }
+ return result;
+}
+
+} // namespace libtextclassifier3