Export lib3 to AOSP (external/libtextclassifier part) 1. Include both annotator (existing one) and actions(new one for smart reply and actions) 2. One more model file. actions_suggestions.model is dropped to /etc/textclassifier./ It is around 7.5mb for now, we will slim down it later. 3. The Java counterpart of the JNI is now moved from frameworks/base to here. Test: atest android.view.textclassifier.TextClassificationManagerTest Change-Id: Icb2458967ef51efa2952b3eaddefbf1f7b359930

commit: 6c4cc67c9849339d4e4dfffcfa3eb2342f767890 [log] [tgz]
author: Tony Mak <tonymak@google.com> Mon Sep 17 11:48:50 2018 +0100
committer: Tony Mak <tonymak@google.com> Tue Sep 25 18:36:59 2018 +0100
tree: 1694602c9fd5abe64a26d6363c82b59baf9fa2b0
parent: 30f477bb6871cfebf3136c71da5c14ef3aa69c97 [diff] [blame]
diff --git a/annotator/tokenizer.cc b/annotator/tokenizer.cc
new file mode 100644
index 0000000..099dccc
--- /dev/null
+++ b/annotator/tokenizer.cc

@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "annotator/tokenizer.h"
+
+#include <algorithm>
+
+#include "utils/base/logging.h"
+#include "utils/strings/utf8.h"
+
+namespace libtextclassifier3 {
+
+Tokenizer::Tokenizer(
+    const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
+    bool split_on_script_change)
+    : split_on_script_change_(split_on_script_change) {
+  for (const TokenizationCodepointRange* range : codepoint_ranges) {
+    codepoint_ranges_.emplace_back(range->UnPack());
+  }
+
+  std::sort(codepoint_ranges_.begin(), codepoint_ranges_.end(),
+            [](const std::unique_ptr<const TokenizationCodepointRangeT>& a,
+               const std::unique_ptr<const TokenizationCodepointRangeT>& b) {
+              return a->start < b->start;
+            });
+}
+
+const TokenizationCodepointRangeT* Tokenizer::FindTokenizationRange(
+    int codepoint) const {
+  auto it = std::lower_bound(
+      codepoint_ranges_.begin(), codepoint_ranges_.end(), codepoint,
+      [](const std::unique_ptr<const TokenizationCodepointRangeT>& range,
+         int codepoint) {
+        // This function compares range with the codepoint for the purpose of
+        // finding the first greater or equal range. Because of the use of
+        // std::lower_bound it needs to return true when range < codepoint;
+        // the first time it will return false the lower bound is found and
+        // returned.
+        //
+        // It might seem weird that the condition is range.end <= codepoint
+        // here but when codepoint == range.end it means it's actually just
+        // outside of the range, thus the range is less than the codepoint.
+        return range->end <= codepoint;
+      });
+  if (it != codepoint_ranges_.end() && (*it)->start <= codepoint &&
+      (*it)->end > codepoint) {
+    return it->get();
+  } else {
+    return nullptr;
+  }
+}
+
+void Tokenizer::GetScriptAndRole(char32 codepoint,
+                                 TokenizationCodepointRange_::Role* role,
+                                 int* script) const {
+  const TokenizationCodepointRangeT* range = FindTokenizationRange(codepoint);
+  if (range) {
+    *role = range->role;
+    *script = range->script_id;
+  } else {
+    *role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
+    *script = kUnknownScript;
+  }
+}
+
+std::vector<Token> Tokenizer::Tokenize(const std::string& text) const {
+  UnicodeText text_unicode = UTF8ToUnicodeText(text, /*do_copy=*/false);
+  return Tokenize(text_unicode);
+}
+
+std::vector<Token> Tokenizer::Tokenize(const UnicodeText& text_unicode) const {
+  std::vector<Token> result;
+  Token new_token("", 0, 0);
+  int codepoint_index = 0;
+
+  int last_script = kInvalidScript;
+  for (auto it = text_unicode.begin(); it != text_unicode.end();
+       ++it, ++codepoint_index) {
+    TokenizationCodepointRange_::Role role;
+    int script;
+    GetScriptAndRole(*it, &role, &script);
+
+    if (role & TokenizationCodepointRange_::Role_SPLIT_BEFORE ||
+        (split_on_script_change_ && last_script != kInvalidScript &&
+         last_script != script)) {
+      if (!new_token.value.empty()) {
+        result.push_back(new_token);
+      }
+      new_token = Token("", codepoint_index, codepoint_index);
+    }
+    if (!(role & TokenizationCodepointRange_::Role_DISCARD_CODEPOINT)) {
+      new_token.value += std::string(
+          it.utf8_data(),
+          it.utf8_data() + GetNumBytesForNonZeroUTF8Char(it.utf8_data()));
+      ++new_token.end;
+    }
+    if (role & TokenizationCodepointRange_::Role_SPLIT_AFTER) {
+      if (!new_token.value.empty()) {
+        result.push_back(new_token);
+      }
+      new_token = Token("", codepoint_index + 1, codepoint_index + 1);
+    }
+
+    last_script = script;
+  }
+  if (!new_token.value.empty()) {
+    result.push_back(new_token);
+  }
+
+  return result;
+}
+
+}  // namespace libtextclassifier3
commit	6c4cc67c9849339d4e4dfffcfa3eb2342f767890	[log] [tgz]
author	Tony Mak <tonymak@google.com>	Mon Sep 17 11:48:50 2018 +0100
committer	Tony Mak <tonymak@google.com>	Tue Sep 25 18:36:59 2018 +0100
tree	1694602c9fd5abe64a26d6363c82b59baf9fa2b0
parent	30f477bb6871cfebf3136c71da5c14ef3aa69c97 [diff] [blame]