Export libtextclassifier to Android
Test: atest android.view.textclassifier.TextClassificationManagerTest
Change-Id: Id7a31dc60c8f6625ff8f2a9c85689e13b121a5a4
diff --git a/utils/base/logging.h b/utils/base/logging.h
index e197780..e8bde39 100644
--- a/utils/base/logging.h
+++ b/utils/base/logging.h
@@ -155,7 +155,7 @@
#endif // NDEBUG
-#ifdef LIBTEXTCLASSIFIER_VLOG
+#ifdef TC3_VLOG
#define TC3_VLOG(severity) \
::libtextclassifier3::logging::LogMessage( \
::libtextclassifier3::logging::INFO, __FILE__, __LINE__) \
diff --git a/utils/calendar/calendar_test.cc b/utils/calendar/calendar_test.cc
index 02ce63f..a8c3af8 100644
--- a/utils/calendar/calendar_test.cc
+++ b/utils/calendar/calendar_test.cc
@@ -45,7 +45,7 @@
TC3_LOG(INFO) << result;
}
-#ifdef LIBTEXTCLASSIFIER_CALENDAR_ICU
+#ifdef TC3_CALENDAR_ICU
TEST_F(CalendarTest, RoundingToGranularity) {
int64 time;
DateParseData data;
@@ -238,7 +238,7 @@
/*granularity=*/GRANULARITY_DAY, &time));
EXPECT_EQ(time, 1523397600000L /* 11 April 2018 00:00:00 */);
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_DUMMY
+#endif  // TC3_CALENDAR_ICU
} // namespace
} // namespace libtextclassifier3
diff --git a/utils/intents/intent-config.fbs b/utils/intents/intent-config.fbs
new file mode 100755
index 0000000..d350ae4
--- /dev/null
+++ b/utils/intents/intent-config.fbs
@@ -0,0 +1,174 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// The type of variable to fetch.
+namespace libtextclassifier3;
+enum AndroidSimpleIntentGeneratorVariableType : int {
+ INVALID_VARIABLE = 0,
+
+ // The raw text that was classified.
+ RAW_TEXT = 1,
+
+ // Text as a URL with explicit protocol. If no protocol was specified, http
+ // is prepended.
+ URL_TEXT = 2,
+
+ // The raw text, but URL encoded.
+ URL_ENCODED_TEXT = 3,
+
+ // For dates/times: the instant of the event in UTC millis.
+ EVENT_TIME_MS_UTC = 4,
+
+ // For dates/times: the start of the event in UTC millis.
+ EVENT_START_MS_UTC = 5,
+
+ // For dates/times: the end of the event in UTC millis.
+ EVENT_END_MS_UTC = 6,
+
+ // Name of the package that's running the classifier.
+ PACKAGE_NAME = 7,
+}
+
+// Enumerates the possible extra types for the simple intent generator.
+namespace libtextclassifier3;
+enum AndroidSimpleIntentGeneratorExtraType : int {
+ INVALID_EXTRA_TYPE = 0,
+ STRING = 1,
+ BOOL = 2,
+ VARIABLE_AS_LONG = 3,
+}
+
+// Enumerates the possible condition types for the simple intent generator.
+namespace libtextclassifier3;
+enum AndroidSimpleIntentGeneratorConditionType : int {
+ INVALID_CONDITION_TYPE = 0,
+
+ // Queries the UserManager for the given boolean restriction. The condition
+ // passes if the result of getBoolean is false. The name of the
+ // restriction to check is in the string_ field.
+ USER_RESTRICTION_NOT_SET = 1,
+
+ // Checks that the parsed event start time is at least a given number of
+ // milliseconds in the future. (Only valid if there is a parsed event
+ // time.) The offset is stored in the int64_ field.
+ EVENT_START_IN_FUTURE_MS = 2,
+}
+
+// Describes how intents for the various entity types should be generated on
+// Android. This is distributed through the model, but not used by
+// libtextclassifier yet - rather, it's passed to the calling Java code, which
+// implements the Intent generation logic.
+namespace libtextclassifier3;
+table AndroidIntentFactoryOptions {
+ entity:[libtextclassifier3.AndroidIntentFactoryEntityOptions];
+}
+
+// Describes how intents should be generated for a particular entity type.
+namespace libtextclassifier3;
+table AndroidIntentFactoryEntityOptions {
+ // The entity type as defined by one of the TextClassifier ENTITY_TYPE
+ // constants. (e.g. "address", "phone", etc.)
+ entity_type:string;
+
+ // List of generators for all the different types of intents that should
+ // be made available for the entity type.
+ generator:[libtextclassifier3.AndroidIntentGeneratorOptions];
+}
+
+// Configures a single Android Intent generator.
+namespace libtextclassifier3;
+table AndroidIntentGeneratorOptions {
+ // Strings for UI elements.
+ strings:[libtextclassifier3.AndroidIntentGeneratorStrings];
+
+ // Generator specific configuration.
+ simple:libtextclassifier3.AndroidSimpleIntentGeneratorOptions;
+}
+
+// Language dependent configuration for an Android Intent generator.
+namespace libtextclassifier3;
+table AndroidIntentGeneratorStrings {
+ // BCP 47 tag for the supported locale. Note that because of API level
+ // restrictions, this must /not/ use wildcards. To e.g. match all English
+ // locales, use only "en" and not "en_*". Reference the java.util.Locale
+ // constructor for details.
+ language_tag:string;
+
+ // Title shown for the action (see RemoteAction.getTitle).
+ title:string;
+
+ // Description shown for the action (see
+ // RemoteAction.getContentDescription).
+ description:string;
+}
+
+// An extra to set on a simple intent generator Intent.
+namespace libtextclassifier3;
+table AndroidSimpleIntentGeneratorExtra {
+ // The name of the extra to set.
+ name:string;
+
+ // The type of the extra to set.
+ type:libtextclassifier3.AndroidSimpleIntentGeneratorExtraType;
+
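+ // The value of the extra. Which of the following fields is read presumably
+ // depends on |type| above (e.g. string_ for STRING, bool_ for BOOL).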
+ string_:string;
+
+ bool_:bool;
+ int32_:int;
+}
+
+// A condition that needs to be fulfilled for an Intent to get generated.
+namespace libtextclassifier3;
+table AndroidSimpleIntentGeneratorCondition {
+ type:libtextclassifier3.AndroidSimpleIntentGeneratorConditionType;
+
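+ // Arguments for the condition. Which field is read depends on |type|; see
+ // the documentation of the condition types above (e.g. string_ for
+ // USER_RESTRICTION_NOT_SET, int64_ for EVENT_START_IN_FUTURE_MS).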
+ string_:string;
+
+ int32_:int;
+ int64_:long;
+}
+
+// Configures an intent generator whose logic is simple enough to be expressed
+// with basic rules - which covers the vast majority of use cases and is
+// analogous to Android Actions.
+// Most strings (action, data, type, ...) may contain variable references. To
+// use them, the generator must first declare all the variables it wishes to use
+// in the variables field. The values then become available as numbered
+// arguments (using the normal java.util.Formatter syntax) in the order they
+// were specified.
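+//
+// For illustration only (hypothetical values, not part of this change): a
+// generator with variable = [URL_ENCODED_TEXT] could use
+//   action = "android.intent.action.VIEW"
+//   data = "https://www.google.com/search?q=%0$s"
+// to open a web search for the classified text, using the "%0$s"
+// substitution convention described above.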
+namespace libtextclassifier3;
+table AndroidSimpleIntentGeneratorOptions {
+ // The action to set on the Intent (see Intent.setAction). Supports variables.
+ action:string;
+
+ // The data to set on the Intent (see Intent.setData). Supports variables.
+ data:string;
+
+ // The type to set on the Intent (see Intent.setType). Supports variables.
+ type:string;
+
+ // The list of all the extras to add to the Intent.
+ extra:[libtextclassifier3.AndroidSimpleIntentGeneratorExtra];
+
+ // The list of all the variables that become available for substitution in
+ // the action, data, type and extra strings. To e.g. set a field to the value
+ // of the first variable, use "%0$s".
+ variable:[libtextclassifier3.AndroidSimpleIntentGeneratorVariableType];
+
+ // The list of all conditions that need to be fulfilled for Intent generation.
+ condition:[libtextclassifier3.AndroidSimpleIntentGeneratorCondition];
+}
+
diff --git a/utils/sentencepiece/encoder.cc b/utils/sentencepiece/encoder.cc
index 96fb868..6ffb0c7 100644
--- a/utils/sentencepiece/encoder.cc
+++ b/utils/sentencepiece/encoder.cc
@@ -35,6 +35,17 @@
normalized_text.RemovePrefix(1);
continue;
}
+ // Check whether we can use the unknown token.
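+ // The unknown token covers a single position; it becomes the candidate
+ // for the segmentation ending at the next position if no candidate has
+ // been recorded there yet or if it scores better than the existing one.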
+ if (unknown_code_ >= 0) {
+ const int pos = i + 1;
+ const float unknown_penalty = segmentation[i].score + unknown_score_;
+ if (segmentation[pos].previous_pos < 0 ||
+ segmentation[pos].score < unknown_penalty) {
+ segmentation[pos] = {/*score=*/unknown_penalty, /*previous_pos=*/i,
+ /*piece_id=*/unknown_code_,
+ /*num_pieces=*/segmentation[i].num_pieces + 1};
+ }
+ }
for (const auto& match : matcher_->FindAllPrefixMatches(normalized_text)) {
TC3_CHECK(match.id >= 0 && match.id < num_pieces_);
const int pos = i + match.match_length;
@@ -42,7 +53,7 @@
if (segmentation[pos].previous_pos < 0 ||
segmentation[pos].score < candidate_score) {
segmentation[pos] = {/*score=*/candidate_score, /*previous_pos=*/i,
- /*piece_id=*/match.id,
+ /*piece_id=*/match.id + encoding_offset_,
/*num_pieces=*/segmentation[i].num_pieces + 1};
}
}
@@ -57,7 +68,7 @@
result[num_pieces + 1] = end_code_;
int pos = len;
for (int i = num_pieces; i > 0; i--) {
- result[i] = segmentation[pos].piece_id + encoding_offset_;
+ result[i] = segmentation[pos].piece_id;
pos = segmentation[pos].previous_pos;
}
result[0] = start_code_;
diff --git a/utils/sentencepiece/encoder.h b/utils/sentencepiece/encoder.h
index fffd86f..0f1bfd3 100644
--- a/utils/sentencepiece/encoder.h
+++ b/utils/sentencepiece/encoder.h
@@ -33,19 +33,24 @@
// a trie.
// num_pieces: the number of pieces in the trie.
// pieces_scores: the scores of the individual pieces.
- // start_code: Code that is used as encoding of the start of input.
- // end_code: Code that is used as encoding of the end of input.
- // encoding_offset: Value added to the sentence piece ids to make them
+ // start_code: code that is used as encoding of the start of input.
+ // end_code: code that is used as encoding of the end of input.
+ // encoding_offset: value added to the sentence piece ids to make them
// not interesecting with start_code and end_code.
+ // unknown_code: code that is used for out-of-dictionary characters.
+ //   unknown_score: the penalty score associated with the unknown code.
Encoder(const SentencePieceMatcher* matcher, const int num_pieces,
const float* pieces_scores, int start_code = 0, int end_code = 1,
- int encoding_offset = 2)
+ int encoding_offset = 2, int unknown_code = -1,
+ float unknown_score = 0.f)
: num_pieces_(num_pieces),
scores_(pieces_scores),
matcher_(matcher),
start_code_(start_code),
end_code_(end_code),
- encoding_offset_(encoding_offset) {}
+ encoding_offset_(encoding_offset),
+ unknown_code_(unknown_code),
+ unknown_score_(unknown_score) {}
// Segment the input so that the total score of the pieces used is maximized.
// This is a simplified implementation of the general Viterbi algorithm,
@@ -74,6 +79,8 @@
const int start_code_;
const int end_code_;
const int encoding_offset_;
+ const int unknown_code_;
+ const float unknown_score_;
};
} // namespace libtextclassifier3
diff --git a/utils/sentencepiece/encoder_test.cc b/utils/sentencepiece/encoder_test.cc
index 59c12ad..6bc9aeb 100644
--- a/utils/sentencepiece/encoder_test.cc
+++ b/utils/sentencepiece/encoder_test.cc
@@ -26,7 +26,7 @@
namespace libtextclassifier3 {
namespace {
-using testing::ElementsAreArray;
+using testing::ElementsAre;
using testing::IsEmpty;
TEST(EncoderTest, SimpleTokenization) {
@@ -38,12 +38,12 @@
const Encoder encoder(matcher.get(),
/*num_pieces=*/4, scores);
- EXPECT_THAT(encoder.Encode("hellothere"), ElementsAreArray({0, 3, 5, 1}));
+ EXPECT_THAT(encoder.Encode("hellothere"), ElementsAre(0, 3, 5, 1));
// Make probability of hello very low:
// hello gets now tokenized as hell + o.
scores[1] = -100.0;
- EXPECT_THAT(encoder.Encode("hellothere"), ElementsAreArray({0, 2, 4, 5, 1}));
+ EXPECT_THAT(encoder.Encode("hellothere"), ElementsAre(0, 2, 4, 5, 1));
}
TEST(EncoderTest, HandlesEdgeCases) {
@@ -54,10 +54,28 @@
/*num_pieces=*/4, offsets, StringPiece(pieces, 18)));
const Encoder encoder(matcher.get(),
/*num_pieces=*/4, scores);
- EXPECT_THAT(encoder.Encode("hellhello"), ElementsAreArray({0, 2, 3, 1}));
- EXPECT_THAT(encoder.Encode("hellohell"), ElementsAreArray({0, 3, 2, 1}));
- EXPECT_THAT(encoder.Encode(""), ElementsAreArray({0, 1}));
- EXPECT_THAT(encoder.Encode("hellathere"), ElementsAreArray({0, 1}));
+ EXPECT_THAT(encoder.Encode("hellhello"), ElementsAre(0, 2, 3, 1));
+ EXPECT_THAT(encoder.Encode("hellohell"), ElementsAre(0, 3, 2, 1));
+ EXPECT_THAT(encoder.Encode(""), ElementsAre(0, 1));
+ EXPECT_THAT(encoder.Encode("hellathere"), ElementsAre(0, 1));
+}
+
+TEST(EncoderTest, HandlesOutOfDictionary) {
+ const char pieces[] = "hell\0hello\0o\0there\0";
+ const int offsets[] = {0, 5, 11, 13};
+ float scores[] = {-0.5, -1.0, -10.0, -1.0};
+ std::unique_ptr<SentencePieceMatcher> matcher(new SortedStringsTable(
+ /*num_pieces=*/4, offsets, StringPiece(pieces, 18)));
+ const Encoder encoder(matcher.get(),
+ /*num_pieces=*/4, scores,
+ /*start_code=*/0, /*end_code=*/1,
+ /*encoding_offset=*/3, /*unknown_code=*/2,
+ /*unknown_score=*/-100.0);
+ EXPECT_THAT(encoder.Encode("hellhello"), ElementsAre(0, 3, 4, 1));
+ EXPECT_THAT(encoder.Encode("hellohell"), ElementsAre(0, 4, 3, 1));
+ EXPECT_THAT(encoder.Encode(""), ElementsAre(0, 1));
+ EXPECT_THAT(encoder.Encode("hellathere"),
+ ElementsAre(0, /*hell*/ 3, /*unknown*/ 2, /*there*/ 6, 1));
}
} // namespace
diff --git a/utils/sentencepiece/normalizer.cc b/utils/sentencepiece/normalizer.cc
index 9fcc1e5..1dd20da 100644
--- a/utils/sentencepiece/normalizer.cc
+++ b/utils/sentencepiece/normalizer.cc
@@ -21,7 +21,7 @@
namespace libtextclassifier3 {
-std::string Normalizer::Normalize(StringPiece input) const {
+std::string SentencePieceNormalizer::Normalize(StringPiece input) const {
std::string normalized;
// Ignores heading space.
@@ -106,7 +106,7 @@
return normalized;
}
-std::pair<StringPiece, int> Normalizer::NormalizePrefix(
+std::pair<StringPiece, int> SentencePieceNormalizer::NormalizePrefix(
StringPiece input) const {
std::pair<StringPiece, int> result;
if (input.empty()) return result;
diff --git a/utils/sentencepiece/normalizer.h b/utils/sentencepiece/normalizer.h
index 582d563..227e09b 100644
--- a/utils/sentencepiece/normalizer.h
+++ b/utils/sentencepiece/normalizer.h
@@ -27,7 +27,7 @@
// Normalizer implements a simple text normalizer with user-defined
// string-to-string rules and leftmost longest matching.
-class Normalizer {
+class SentencePieceNormalizer {
public:
// charsmap_trie and charsmap_normalized specify the normalization/replacement
// string-to-string rules in the following way:
@@ -41,10 +41,11 @@
// internal whitespace.
//
// escape_whitespaces: Whether to replace whitespace with a meta symbol.
- Normalizer(const DoubleArrayTrie &charsmap_trie,
- StringPiece charsmap_normalized, bool add_dummy_prefix = true,
- bool remove_extra_whitespaces = true,
- bool escape_whitespaces = true)
+ SentencePieceNormalizer(const DoubleArrayTrie &charsmap_trie,
+ StringPiece charsmap_normalized,
+ bool add_dummy_prefix = true,
+ bool remove_extra_whitespaces = true,
+ bool escape_whitespaces = true)
: charsmap_trie_(charsmap_trie),
charsmap_normalized_(charsmap_normalized),
add_dummy_prefix_(add_dummy_prefix),
diff --git a/utils/sentencepiece/normalizer_test.cc b/utils/sentencepiece/normalizer_test.cc
index 143e795..f6018ab 100644
--- a/utils/sentencepiece/normalizer_test.cc
+++ b/utils/sentencepiece/normalizer_test.cc
@@ -36,9 +36,10 @@
std::ifstream test_config_stream(GetTestConfigPath());
std::string config((std::istreambuf_iterator<char>(test_config_stream)),
(std::istreambuf_iterator<char>()));
- Normalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
- /*remove_extra_whitespaces=*/true,
- /*escape_whitespaces=*/true);
+ SentencePieceNormalizer normalizer =
+ NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
+ /*remove_extra_whitespaces=*/true,
+ /*escape_whitespaces=*/true);
EXPECT_EQ(normalizer.Normalize("hello there"), "▁hello▁there");
@@ -63,9 +64,10 @@
std::ifstream test_config_stream(GetTestConfigPath());
std::string config((std::istreambuf_iterator<char>(test_config_stream)),
(std::istreambuf_iterator<char>()));
- Normalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
- /*remove_extra_whitespaces=*/true,
- /*escape_whitespaces=*/true);
+ SentencePieceNormalizer normalizer =
+ NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
+ /*remove_extra_whitespaces=*/true,
+ /*escape_whitespaces=*/true);
EXPECT_EQ(normalizer.Normalize("hello there"), "hello▁there");
@@ -90,9 +92,10 @@
std::ifstream test_config_stream(GetTestConfigPath());
std::string config((std::istreambuf_iterator<char>(test_config_stream)),
(std::istreambuf_iterator<char>()));
- Normalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
- /*remove_extra_whitespaces=*/false,
- /*escape_whitespaces=*/true);
+ SentencePieceNormalizer normalizer =
+ NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
+ /*remove_extra_whitespaces=*/false,
+ /*escape_whitespaces=*/true);
EXPECT_EQ(normalizer.Normalize("hello there"), "hello▁there");
@@ -108,9 +111,10 @@
std::ifstream test_config_stream(GetTestConfigPath());
std::string config((std::istreambuf_iterator<char>(test_config_stream)),
(std::istreambuf_iterator<char>()));
- Normalizer normalizer = NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
- /*remove_extra_whitespaces=*/false,
- /*escape_whitespaces=*/false);
+ SentencePieceNormalizer normalizer =
+ NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
+ /*remove_extra_whitespaces=*/false,
+ /*escape_whitespaces=*/false);
EXPECT_EQ(normalizer.Normalize("hello there"), "hello there");
diff --git a/utils/sentencepiece/test_utils.cc b/utils/sentencepiece/test_utils.cc
index 1b766ac..1ed2bf3 100644
--- a/utils/sentencepiece/test_utils.cc
+++ b/utils/sentencepiece/test_utils.cc
@@ -24,15 +24,16 @@
namespace libtextclassifier3 {
-Normalizer NormalizerFromSpec(StringPiece spec, bool add_dummy_prefix,
- bool remove_extra_whitespaces,
- bool escape_whitespaces) {
+SentencePieceNormalizer NormalizerFromSpec(StringPiece spec,
+ bool add_dummy_prefix,
+ bool remove_extra_whitespaces,
+ bool escape_whitespaces) {
const uint32 trie_blob_size = reinterpret_cast<const uint32*>(spec.data())[0];
spec.RemovePrefix(sizeof(trie_blob_size));
const TrieNode* trie_blob = reinterpret_cast<const TrieNode*>(spec.data());
spec.RemovePrefix(trie_blob_size);
const int num_nodes = trie_blob_size / sizeof(TrieNode);
- return Normalizer(
+ return SentencePieceNormalizer(
DoubleArrayTrie(trie_blob, num_nodes),
/*charsmap_normalized=*/StringPiece(spec.data(), spec.size()),
add_dummy_prefix, remove_extra_whitespaces, escape_whitespaces);
diff --git a/utils/sentencepiece/test_utils.h b/utils/sentencepiece/test_utils.h
index 71a4994..0c833da 100644
--- a/utils/sentencepiece/test_utils.h
+++ b/utils/sentencepiece/test_utils.h
@@ -25,9 +25,10 @@
namespace libtextclassifier3 {
-Normalizer NormalizerFromSpec(StringPiece spec, bool add_dummy_prefix,
- bool remove_extra_whitespaces,
- bool escape_whitespaces);
+SentencePieceNormalizer NormalizerFromSpec(StringPiece spec,
+ bool add_dummy_prefix,
+ bool remove_extra_whitespaces,
+ bool escape_whitespaces);
} // namespace libtextclassifier3
diff --git a/utils/tflite/text_encoder.cc b/utils/tflite/text_encoder.cc
index 9554283..734b5b0 100644
--- a/utils/tflite/text_encoder.cc
+++ b/utils/tflite/text_encoder.cc
@@ -35,7 +35,7 @@
namespace {
struct TextEncoderOp {
- std::unique_ptr<Normalizer> normalizer;
+ std::unique_ptr<SentencePieceNormalizer> normalizer;
std::unique_ptr<Encoder> encoder;
std::unique_ptr<SentencePieceMatcher> matcher;
};
@@ -81,7 +81,7 @@
config->normalization_charsmap()->Data());
const int charsmap_trie_nodes_length =
config->normalization_charsmap()->Length() / sizeof(TrieNode);
- encoder_op->normalizer.reset(new Normalizer(
+ encoder_op->normalizer.reset(new SentencePieceNormalizer(
DoubleArrayTrie(charsmap_trie_nodes, charsmap_trie_nodes_length),
StringPiece(config->normalization_charsmap_values()->data(),
config->normalization_charsmap_values()->size()),
@@ -113,7 +113,8 @@
}
encoder_op->encoder.reset(new Encoder(
encoder_op->matcher.get(), num_pieces, config->pieces_scores()->data(),
- config->start_code(), config->end_code(), config->encoding_offset()));
+ config->start_code(), config->end_code(), config->encoding_offset(),
+ config->unknown_code(), config->unknown_score()));
return encoder_op.release();
}
diff --git a/utils/tflite/text_encoder_config.fbs b/utils/tflite/text_encoder_config.fbs
index 462da21..8ae8fc5 100644
--- a/utils/tflite/text_encoder_config.fbs
+++ b/utils/tflite/text_encoder_config.fbs
@@ -34,6 +34,12 @@
// `start_code` and `end_code`.
encoding_offset:int32 = 2;
+ // Code that is used for out-of-dictionary characters.
+ unknown_code:int32 = -1;
+
+ // Penalty associated with the unknown code.
+ unknown_score:float;
+
// Normalization options.
// Serialized normalization charsmap.
normalization_charsmap:string;
diff --git a/utils/tflite/text_encoder_test.cc b/utils/tflite/text_encoder_test.cc
index 0b6ff71..0cd67ce 100644
--- a/utils/tflite/text_encoder_test.cc
+++ b/utils/tflite/text_encoder_test.cc
@@ -20,6 +20,7 @@
#include "utils/tflite/text_encoder.h"
#include "gtest/gtest.h"
+#include "third_party/absl/flags/flag.h"
#include "flatbuffers/flexbuffers.h"
#include "tensorflow/contrib/lite/interpreter.h"
#include "tensorflow/contrib/lite/kernels/register.h"
diff --git a/utils/utf8/unilib_test.cc b/utils/utf8/unilib_test.cc
index e2ad26b..96b2c2d 100644
--- a/utils/utf8/unilib_test.cc
+++ b/utils/utf8/unilib_test.cc
@@ -50,7 +50,7 @@
EXPECT_EQ(unilib_.GetPairedBracket('}'), '{');
}
-#ifndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
+#ifndef TC3_UNILIB_DUMMY
TEST_F(UniLibTest, CharacterClassesUnicode) {
EXPECT_TRUE(unilib_.IsOpeningBracket(0x0F3C)); // TIBET ANG KHANG GYON
EXPECT_TRUE(unilib_.IsClosingBracket(0x0F3D)); // TIBET ANG KHANG GYAS
@@ -72,7 +72,7 @@
EXPECT_EQ(unilib_.GetPairedBracket(0x0F3C), 0x0F3D);
EXPECT_EQ(unilib_.GetPairedBracket(0x0F3D), 0x0F3C);
}
-#endif // ndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
+#endif // ndef TC3_UNILIB_DUMMY
TEST_F(UniLibTest, RegexInterface) {
const UnicodeText regex_pattern =
@@ -89,7 +89,7 @@
TC3_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
}
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, Regex) {
// The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
// test the regex functionality with it to verify we are handling the indices
@@ -126,9 +126,9 @@
EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(), "0123😋");
EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, RegexGroups) {
// The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
// test the regex functionality with it to verify we are handling the indices
@@ -163,9 +163,9 @@
EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, BreakIterator) {
const UnicodeText text = UTF8ToUnicodeText("some text", /*do_copy=*/false);
@@ -178,9 +178,9 @@
}
EXPECT_THAT(break_indices, ElementsAre(4, 5, 9));
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, BreakIterator4ByteUTF8) {
const UnicodeText text = UTF8ToUnicodeText("😀😂😋", /*do_copy=*/false);
std::unique_ptr<UniLib::BreakIterator> iterator =
@@ -192,18 +192,18 @@
}
EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
-#ifndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
+#ifndef TC3_UNILIB_JAVAICU
TEST_F(UniLibTest, IntegerParse) {
int result;
EXPECT_TRUE(
unilib_.ParseInt32(UTF8ToUnicodeText("123", /*do_copy=*/false), &result));
EXPECT_EQ(result, 123);
}
-#endif // ndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
+#endif // ndef TC3_UNILIB_JAVAICU
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, IntegerParseFullWidth) {
int result;
// The input string here is full width
@@ -211,16 +211,16 @@
&result));
EXPECT_EQ(result, 123);
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
-#ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
+#ifdef TC3_UNILIB_ICU
TEST_F(UniLibTest, IntegerParseFullWidthWithAlpha) {
int result;
// The input string here is full width
EXPECT_FALSE(unilib_.ParseInt32(UTF8ToUnicodeText("1a3", /*do_copy=*/false),
&result));
}
-#endif // LIBTEXTCLASSIFIER_UNILIB_ICU
+#endif // TC3_UNILIB_ICU
} // namespace
} // namespace libtextclassifier3