Import libtextclassifier changes from google3. This includes an upgrade to the latest version of the model. We now use a test data path constant in the unit tests. This is consistent with another test project, minikin_tests. Test: Tests pass on-device. Bug: 34865247 Change-Id: Ia061888a0bba371c6b429ad7c5457af611ba12f2

commit: be876dc13b917662ad8688da10e688cdaf6ce051 [log] [tgz]
author: Matt Sharifi <mns@google.com> Fri Mar 17 17:02:43 2017 +0100
committer: Matt Sharifi <mns@google.com> Mon Mar 20 12:55:43 2017 +0100
tree: 222b245f9af830fd8c8187967d9a6372e43d45fb
parent: 72282c6d2a320b89df4a3ed75df027f67c4ee6f3 [diff]
diff --git a/Android.mk b/Android.mk
index 823815d..204a97a 100644
--- a/Android.mk
+++ b/Android.mk

@@ -70,6 +70,9 @@
 
 LOCAL_TEST_DATA := $(call find-test-data-in-subdirs, $(LOCAL_PATH), *, tests/testdata)
 
+LOCAL_CPPFLAGS_32 += -DTEST_DATA_DIR="\"/data/nativetest/libtextclassifier_tests/tests/testdata/\""
+LOCAL_CPPFLAGS_64 += -DTEST_DATA_DIR="\"/data/nativetest64/libtextclassifier_tests/tests/testdata/\""
+
 LOCAL_SRC_FILES := $(patsubst ./%,%, $(shell cd $(LOCAL_PATH); \
     find . -name "*.cc" -and -not -name ".*"))
 LOCAL_C_INCLUDES += .

diff --git a/models/textclassifier.smartselection.en.model b/models/textclassifier.smartselection.en.model
index 0530a6f..2b96f42 100644
--- a/models/textclassifier.smartselection.en.model
+++ b/models/textclassifier.smartselection.en.model
Binary files differ

diff --git a/smartselect/feature-processor.cc b/smartselect/feature-processor.cc
index c931021..9e357bd 100644
--- a/smartselect/feature-processor.cc
+++ b/smartselect/feature-processor.cc

@@ -125,27 +125,8 @@
   }
 }
 
-std::pair<SelectionWithContext, int> ExtractLineWithClick(
-    const SelectionWithContext& selection_with_context) {
-  std::string new_context;
-  int shift;
-  std::tie(new_context, shift) = ExtractLineWithSpan(
-      selection_with_context.context,
-      {selection_with_context.click_start, selection_with_context.click_end});
-
-  SelectionWithContext result(selection_with_context);
-  result.selection_start -= shift;
-  result.selection_end -= shift;
-  result.click_start -= shift;
-  result.click_end -= shift;
-  result.context = new_context;
-  return {result, shift};
-}
-
-}  // namespace internal
-
-std::pair<std::string, int> ExtractLineWithSpan(const std::string& context,
-                                                CodepointSpan span) {
+void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
+                               std::vector<Token>* tokens) {
   const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                         /*do_copy=*/false);
   std::vector<UnicodeTextRange> lines;
@@ -164,19 +145,26 @@
   }
   for (const UnicodeTextRange& line : lines) {
     // Find the line that completely contains the span.
-    if (line.first <= span_start && line.second >= span_start &&
-        line.first <= span_end && line.second >= span_end) {
+    if (line.first <= span_start && line.second >= span_end) {
       const CodepointIndex last_line_begin_index =
           std::distance(context_unicode.begin(), line.first);
+      const CodepointIndex last_line_end_index =
+          last_line_begin_index + std::distance(line.first, line.second);
 
-      std::string result =
-          context_unicode.UTF8Substring(line.first, line.second);
-      return {result, last_line_begin_index};
+      for (auto token = tokens->begin(); token != tokens->end();) {
+        if (token->start >= last_line_begin_index &&
+            token->end <= last_line_end_index) {
+          ++token;
+        } else {
+          token = tokens->erase(token);
+        }
+      }
     }
   }
-  return {context, 0};
 }
 
+}  // namespace internal
+
 const char* const FeatureProcessor::kFeatureTypeName = "chargram_continuous";
 
 std::vector<Token> FeatureProcessor::Tokenize(
@@ -308,56 +296,81 @@
   return kInvalidIndex;
 }
 
-// Helper function to get the click position (in terms of index into a vector of
-// selectable tokens), given selectable tokens. If click position is given, it
-// will be used. If it is not given, but selection is given, the click will be
-// the middle token in the selection range.
-int GetClickPosition(const SelectionWithContext& selection_with_context,
-                     const std::vector<Token>& selectable_tokens) {
-  int click_pos = kInvalidIndex;
-  if (selection_with_context.GetClickSpan() !=
-      std::make_pair(kInvalidIndex, kInvalidIndex)) {
-    int range_begin;
-    int range_end;
-    std::tie(range_begin, range_end) = CodepointSpanToTokenSpan(
-        selectable_tokens, selection_with_context.GetClickSpan());
+}  // namespace
 
-    // If no exact match was found, try finding a token that completely contains
-    // the click span. This is useful e.g. when Android builds the selection
-    // using ICU tokenization, and ends up with only a portion of our space-
-    // separated token. E.g. for "(857)" Android would select "857".
-    if (range_begin == kInvalidIndex || range_end == kInvalidIndex) {
-      int token_index = FindTokenThatContainsSpan(
-          selectable_tokens, selection_with_context.GetClickSpan());
-      if (token_index != kInvalidIndex) {
-        range_begin = token_index;
-        range_end = token_index + 1;
-      }
-    }
+namespace internal {
 
-    // We only allow clicks that are exactly 1 selectable token.
-    if (range_end - range_begin == 1) {
-      click_pos = range_begin;
-    } else {
-      click_pos = kInvalidIndex;
-    }
-  } else if (selection_with_context.GetSelectionSpan() !=
-             std::make_pair(kInvalidIndex, kInvalidIndex)) {
-    int range_begin;
-    int range_end;
-    std::tie(range_begin, range_end) = CodepointSpanToTokenSpan(
-        selectable_tokens, selection_with_context.GetSelectionSpan());
+int CenterTokenFromClick(CodepointSpan span,
+                         const std::vector<Token>& selectable_tokens) {
+  int range_begin;
+  int range_end;
+  std::tie(range_begin, range_end) =
+      CodepointSpanToTokenSpan(selectable_tokens, span);
 
-    // Center the clicked token in the selection range.
-    if (range_begin != kInvalidIndex && range_end != kInvalidIndex) {
-      click_pos = (range_begin + range_end - 1) / 2;
+  // If no exact match was found, try finding a token that completely contains
+  // the click span. This is useful e.g. when Android builds the selection
+  // using ICU tokenization, and ends up with only a portion of our space-
+  // separated token. E.g. for "(857)" Android would select "857".
+  if (range_begin == kInvalidIndex || range_end == kInvalidIndex) {
+    int token_index = FindTokenThatContainsSpan(selectable_tokens, span);
+    if (token_index != kInvalidIndex) {
+      range_begin = token_index;
+      range_end = token_index + 1;
     }
   }
 
-  return click_pos;
+  // We only allow clicks that are exactly 1 selectable token.
+  if (range_end - range_begin == 1) {
+    return range_begin;
+  } else {
+    return kInvalidIndex;
+  }
 }
 
-}  // namespace
+int CenterTokenFromMiddleOfSelection(
+    CodepointSpan span, const std::vector<Token>& selectable_tokens) {
+  int range_begin;
+  int range_end;
+  std::tie(range_begin, range_end) =
+      CodepointSpanToTokenSpan(selectable_tokens, span);
+
+  // Center the clicked token in the selection range.
+  if (range_begin != kInvalidIndex && range_end != kInvalidIndex) {
+    return (range_begin + range_end - 1) / 2;
+  } else {
+    return kInvalidIndex;
+  }
+}
+
+}  // namespace internal
+
+int FeatureProcessor::FindCenterToken(CodepointSpan span,
+                                      const std::vector<Token>& tokens) const {
+  if (options_.center_token_selection_method() ==
+      FeatureProcessorOptions::CENTER_TOKEN_FROM_CLICK) {
+    return internal::CenterTokenFromClick(span, tokens);
+  } else if (options_.center_token_selection_method() ==
+             FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION) {
+    return internal::CenterTokenFromMiddleOfSelection(span, tokens);
+  } else if (options_.center_token_selection_method() ==
+             FeatureProcessorOptions::DEFAULT_CENTER_TOKEN_METHOD) {
+    // TODO(zilka): This is a HACK not to break the current models. Remove once
+    // we have new models on the device.
+    // It uses the fact that sharing model use
+    // split_tokens_on_selection_boundaries and selection not. So depending on
+    // this we select the right way of finding the click location.
+    if (!options_.split_tokens_on_selection_boundaries()) {
+      // SmartSelection model.
+      return internal::CenterTokenFromClick(span, tokens);
+    } else {
+      // SmartSharing model.
+      return internal::CenterTokenFromMiddleOfSelection(span, tokens);
+    }
+  } else {
+    TC_LOG(ERROR) << "Invalid center token selection method.";
+    return kInvalidIndex;
+  }
+}
 
 std::vector<Token> FeatureProcessor::FindTokensInSelection(
     const std::vector<Token>& selectable_tokens,
@@ -401,8 +414,20 @@
   }
 }
 
+bool FeatureProcessor::GetFeatures(
+    const std::string& context, CodepointSpan input_span,
+    std::vector<nlp_core::FeatureVector>* features,
+    std::vector<float>* extra_features,
+    std::vector<CodepointSpan>* selection_label_spans) const {
+  return FeatureProcessor::GetFeaturesAndLabels(
+      context, input_span, {kInvalidIndex, kInvalidIndex}, "", features,
+      extra_features, selection_label_spans, /*selection_label=*/nullptr,
+      /*selection_codepoint_label=*/nullptr, /*classification_label=*/nullptr);
+}
+
 bool FeatureProcessor::GetFeaturesAndLabels(
-    const SelectionWithContext& selection_with_context,
+    const std::string& context, CodepointSpan input_span,
+    CodepointSpan label_span, const std::string& label_collection,
     std::vector<nlp_core::FeatureVector>* features,
     std::vector<float>* extra_features,
     std::vector<CodepointSpan>* selection_label_spans, int* selection_label,
@@ -413,39 +438,32 @@
   *features =
       std::vector<nlp_core::FeatureVector>(options_.context_size() * 2 + 1);
 
-  SelectionWithContext selection_normalized;
-  int normalization_shift;
-  if (options_.only_use_line_with_click()) {
-    std::tie(selection_normalized, normalization_shift) =
-        internal::ExtractLineWithClick(selection_with_context);
-  } else {
-    selection_normalized = selection_with_context;
-    normalization_shift = 0;
+  std::vector<Token> input_tokens = Tokenize(context);
+
+  if (options_.split_tokens_on_selection_boundaries()) {
+    internal::SplitTokensOnSelectionBoundaries(input_span, &input_tokens);
   }
 
-  std::vector<Token> input_tokens = Tokenize(selection_normalized.context);
-  if (options_.split_tokens_on_selection_boundaries()) {
-    internal::SplitTokensOnSelectionBoundaries(
-        selection_with_context.GetSelectionSpan(), &input_tokens);
+  if (options_.only_use_line_with_click()) {
+    internal::StripTokensFromOtherLines(context, input_span, &input_tokens);
   }
-  int click_pos = GetClickPosition(selection_normalized, input_tokens);
+
+  const int click_pos = FindCenterToken(input_span, input_tokens);
   if (click_pos == kInvalidIndex) {
     TC_LOG(ERROR) << "Could not extract click position.";
     return false;
   }
 
   std::vector<Token> output_tokens;
-  bool status = ComputeFeatures(click_pos, input_tokens,
-                                selection_normalized.GetSelectionSpan(),
-                                features, extra_features, &output_tokens);
+  bool status = ComputeFeatures(click_pos, input_tokens, input_span, features,
+                                extra_features, &output_tokens);
   if (!status) {
     TC_LOG(ERROR) << "Feature computation failed.";
     return false;
   }
 
   if (selection_label != nullptr) {
-    status = SpanToLabel(selection_normalized.GetSelectionSpan(), output_tokens,
-                         selection_label);
+    status = SpanToLabel(label_span, output_tokens, selection_label);
     if (!status) {
       TC_LOG(ERROR) << "Could not convert selection span to label.";
       return false;
@@ -453,20 +471,10 @@
   }
 
   if (selection_codepoint_label != nullptr) {
-    *selection_codepoint_label = selection_with_context.GetSelectionSpan();
+    *selection_codepoint_label = label_span;
   }
 
   if (selection_label_spans != nullptr) {
-    // If an input normalization was performed, we need to shift the tokens
-    // back to get the correct ranges in the original input.
-    if (normalization_shift) {
-      for (Token& token : output_tokens) {
-        if (token.start != kInvalidIndex && token.end != kInvalidIndex) {
-          token.start += normalization_shift;
-          token.end += normalization_shift;
-        }
-      }
-    }
     for (int i = 0; i < label_to_selection_.size(); ++i) {
       CodepointSpan span;
       status = LabelToSpan(i, output_tokens, &span);
@@ -479,14 +487,15 @@
   }
 
   if (classification_label != nullptr) {
-    *classification_label = CollectionToLabel(selection_normalized.collection);
+    *classification_label = CollectionToLabel(label_collection);
   }
 
   return true;
 }
 
 bool FeatureProcessor::GetFeaturesAndLabels(
-    const SelectionWithContext& selection_with_context,
+    const std::string& context, CodepointSpan input_span,
+    CodepointSpan label_span, const std::string& label_collection,
     std::vector<std::vector<std::pair<int, float>>>* features,
     std::vector<float>* extra_features,
     std::vector<CodepointSpan>* selection_label_spans, int* selection_label,
@@ -499,10 +508,10 @@
   }
 
   std::vector<nlp_core::FeatureVector> feature_vectors;
-  bool result = GetFeaturesAndLabels(selection_with_context, &feature_vectors,
-                                     extra_features, selection_label_spans,
-                                     selection_label, selection_codepoint_label,
-                                     classification_label);
+  bool result = GetFeaturesAndLabels(
+      context, input_span, label_span, label_collection, &feature_vectors,
+      extra_features, selection_label_spans, selection_label,
+      selection_codepoint_label, classification_label);
 
   if (!result) {
     return false;

diff --git a/smartselect/feature-processor.h b/smartselect/feature-processor.h
index 712534d..311be3e 100644
--- a/smartselect/feature-processor.h
+++ b/smartselect/feature-processor.h

@@ -43,30 +43,29 @@
 TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
     const FeatureProcessorOptions& options);
 
-// Returns a modified version of the selection_with_context, such that only the
-// line that contains the clicked span is kept, the number of codepoints
-// the selection was moved by.
-std::pair<SelectionWithContext, int> ExtractLineWithClick(
-    const SelectionWithContext& selection_with_context);
+// Removes tokens that are not part of a line of the context which contains
+// given span.
+void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
+                               std::vector<Token>* tokens);
 
 // Splits tokens that contain the selection boundary inside them.
 // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
 void SplitTokensOnSelectionBoundaries(CodepointSpan selection,
                                       std::vector<Token>* tokens);
 
+// Returns the index of token that corresponds to the codepoint span.
+int CenterTokenFromClick(CodepointSpan span, const std::vector<Token>& tokens);
+
+// Returns the index of token that corresponds to the middle of the  codepoint
+// span.
+int CenterTokenFromMiddleOfSelection(
+    CodepointSpan span, const std::vector<Token>& selectable_tokens);
+
 }  // namespace internal
 
 TokenSpan CodepointSpanToTokenSpan(const std::vector<Token>& selectable_tokens,
                                    CodepointSpan codepoint_span);
 
-// Returns a modified version of the context string, such that only the
-// line that contains the span is kept. Also returns a codepoint shift
-// size that happend. If the span spans multiple lines, returns the original
-// input with zero shift.
-// The following characters are considered to be line separators: '\n', '|'
-std::pair<std::string, int> ExtractLineWithSpan(const std::string& context,
-                                                CodepointSpan span);
-
 // Takes care of preparing features for the FFModel.
 class FeatureProcessor {
  public:
@@ -92,9 +91,16 @@
   // Tokenizes the input string using the selected tokenization method.
   std::vector<Token> Tokenize(const std::string& utf8_text) const;
 
+  bool GetFeatures(const std::string& context, CodepointSpan input_span,
+                   std::vector<nlp_core::FeatureVector>* features,
+                   std::vector<float>* extra_features,
+                   std::vector<CodepointSpan>* selection_label_spans) const;
+
   // NOTE: If dropout is on, subsequent calls of this function with the same
   // arguments might return different results.
-  bool GetFeaturesAndLabels(const SelectionWithContext& selection_with_context,
+  bool GetFeaturesAndLabels(const std::string& context,
+                            CodepointSpan input_span, CodepointSpan label_span,
+                            const std::string& label_collection,
                             std::vector<nlp_core::FeatureVector>* features,
                             std::vector<float>* extra_features,
                             std::vector<CodepointSpan>* selection_label_spans,
@@ -106,7 +112,8 @@
   // NOTE: If dropout is on, subsequent calls of this function with the same
   // arguments might return different results.
   bool GetFeaturesAndLabels(
-      const SelectionWithContext& selection_with_context,
+      const std::string& context, CodepointSpan input_span,
+      CodepointSpan label_span, const std::string& label_collection,
       std::vector<std::vector<std::pair<int, float>>>* features,
       std::vector<float>* extra_features,
       std::vector<CodepointSpan>* selection_label_spans, int* selection_label,
@@ -179,12 +186,17 @@
   // Converts a token span to the corresponding label.
   int TokenSpanToLabel(const std::pair<TokenIndex, TokenIndex>& span) const;
 
-  // Find tokens that are part of the selection.
+  // Finds tokens that are part of the selection.
   // NOTE: Will select all tokens that somehow overlap with the selection.
   std::vector<Token> FindTokensInSelection(
       const std::vector<Token>& selectable_tokens,
       const SelectionWithContext& selection_with_context) const;
 
+  // Finds the center token index in tokens vector, using the method defined
+  // in options_.
+  int FindCenterToken(CodepointSpan span,
+                      const std::vector<Token>& tokens) const;
+
  private:
   FeatureProcessorOptions options_;
 

diff --git a/smartselect/text-classification-model.cc b/smartselect/text-classification-model.cc
index 5fece36..557497e 100644
--- a/smartselect/text-classification-model.cc
+++ b/smartselect/text-classification-model.cc

@@ -51,8 +51,15 @@
 
     // If no tokenization codepoint config is present, tokenize on space.
     if (feature_processor_options.tokenization_codepoint_config_size() == 0) {
-      TokenizationCodepointRange* config =
-          feature_processor_options.add_tokenization_codepoint_config();
+      TokenizationCodepointRange* config;
+      // New line character.
+      config = feature_processor_options.add_tokenization_codepoint_config();
+      config->set_start(10);
+      config->set_end(11);
+      config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
+
+      // Space character.
+      config = feature_processor_options.add_tokenization_codepoint_config();
       config->set_start(32);
       config->set_end(33);
       config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
@@ -186,23 +193,13 @@
 }
 
 EmbeddingNetwork::Vector TextClassificationModel::InferInternal(
-    const std::string& context, CodepointSpan click_indices,
-    CodepointSpan selection_indices, const FeatureProcessor& feature_processor,
-    const EmbeddingNetwork* network,
-    std::vector<CodepointSpan>* selection_label_spans, int* selection_label,
-    CodepointSpan* selection_codepoint_label, int* classification_label) const {
-  SelectionWithContext selection_with_context;
-  selection_with_context.context = context;
-  selection_with_context.click_start = std::get<0>(click_indices);
-  selection_with_context.click_end = std::get<1>(click_indices);
-  selection_with_context.selection_start = std::get<0>(selection_indices);
-  selection_with_context.selection_end = std::get<1>(selection_indices);
-
+    const std::string& context, CodepointSpan span,
+    const FeatureProcessor& feature_processor, const EmbeddingNetwork* network,
+    std::vector<CodepointSpan>* selection_label_spans) const {
   std::vector<FeatureVector> features;
   std::vector<float> extra_features;
-  const bool features_computed = feature_processor.GetFeaturesAndLabels(
-      selection_with_context, &features, &extra_features, selection_label_spans,
-      selection_label, selection_codepoint_label, classification_label);
+  const bool features_computed = feature_processor.GetFeatures(
+      context, span, &features, &extra_features, selection_label_spans);
 
   EmbeddingNetwork::Vector scores;
   if (!features_computed) {
@@ -250,17 +247,10 @@
     return {click_indices, -1.0};
   }
 
-  // Invalid selection indices make the feature extraction use the provided
-  // click indices.
-  const CodepointSpan selection_indices({kInvalidIndex, kInvalidIndex});
-
   std::vector<CodepointSpan> selection_label_spans;
-  EmbeddingNetwork::Vector scores = InferInternal(
-      context, click_indices, selection_indices, *selection_feature_processor_,
-      selection_network_.get(), &selection_label_spans,
-      /*selection_label=*/nullptr,
-      /*selection_codepoint_label=*/nullptr,
-      /*classification_label=*/nullptr);
+  EmbeddingNetwork::Vector scores =
+      InferInternal(context, click_indices, *selection_feature_processor_,
+                    selection_network_.get(), &selection_label_spans);
 
   if (!scores.empty()) {
     scores = nlp_core::ComputeSoftmax(scores);
@@ -317,18 +307,12 @@
 // matter which word of it was tapped - all of them will lead to the same
 // selection.
 CodepointSpan TextClassificationModel::SuggestSelectionSymmetrical(
-    const std::string& full_context, CodepointSpan click_indices) const {
-  // Extract context from the current line only.
-  std::string context;
-  int context_shift;
-  std::tie(context, context_shift) =
-      ExtractLineWithSpan(full_context, click_indices);
-  click_indices.first -= context_shift;
-  click_indices.second -= context_shift;
-
+    const std::string& context, CodepointSpan click_indices) const {
   std::vector<Token> tokens = selection_feature_processor_->Tokenize(context);
+  internal::StripTokensFromOtherLines(context, click_indices, &tokens);
 
-  const int click_index = GetClickTokenIndex(tokens, click_indices);
+  // const int click_index = GetClickTokenIndex(tokens, click_indices);
+  const int click_index = internal::CenterTokenFromClick(click_indices, tokens);
   if (click_index == kInvalidIndex) {
     return click_indices;
   }
@@ -337,7 +321,7 @@
 
   // Scan in the symmetry context for selection span proposals.
   std::vector<std::pair<CodepointSpan, float>> proposals;
-  for (int i = -symmetry_context_size; i < symmetry_context_size + 1; i++) {
+  for (int i = -symmetry_context_size; i < symmetry_context_size + 1; ++i) {
     const int token_index = click_index + i;
     if (token_index >= 0 && token_index < tokens.size()) {
       float score;
@@ -373,8 +357,7 @@
 
       if (feasible) {
         if (span.first <= click_index && span.second > click_index) {
-          return {span_result.first.first + context_shift,
-                  span_result.first.second + context_shift};
+          return {span_result.first.first, span_result.first.second};
         }
         for (int i = span.first; i < span.second; i++) {
           used_tokens[i] = 1;
@@ -383,8 +366,7 @@
     }
   }
 
-  return {click_indices.first + context_shift,
-          click_indices.second + context_shift};
+  return {click_indices.first, click_indices.second};
 }
 
 CodepointSpan TextClassificationModel::SuggestSelection(
@@ -409,13 +391,9 @@
     return {};
   }
 
-  // Invalid click indices make the feature extraction select the middle word in
-  // the selection span.
-  const CodepointSpan click_indices({kInvalidIndex, kInvalidIndex});
-
-  EmbeddingNetwork::Vector scores = InferInternal(
-      context, click_indices, selection_indices, *sharing_feature_processor_,
-      sharing_network_.get(), nullptr, nullptr, nullptr, nullptr);
+  EmbeddingNetwork::Vector scores =
+      InferInternal(context, selection_indices, *sharing_feature_processor_,
+                    sharing_network_.get(), nullptr);
   if (scores.empty()) {
     TC_LOG(ERROR) << "Using default class";
     return {};

diff --git a/smartselect/text-classification-model.h b/smartselect/text-classification-model.h
index c30a4d3..458249a 100644
--- a/smartselect/text-classification-model.h
+++ b/smartselect/text-classification-model.h

@@ -137,13 +137,10 @@
   bool LoadModels(int fd);
 
   nlp_core::EmbeddingNetwork::Vector InferInternal(
-      const std::string& context, CodepointSpan click_indices,
-      CodepointSpan selection_indices,
+      const std::string& context, CodepointSpan span,
       const FeatureProcessor& feature_processor,
       const nlp_core::EmbeddingNetwork* network,
-      std::vector<CodepointSpan>* selection_label_spans, int* selection_label,
-      CodepointSpan* selection_codepoint_label,
-      int* classification_label) const;
+      std::vector<CodepointSpan>* selection_label_spans) const;
 
   // Returns a selection suggestion with a score.
   std::pair<CodepointSpan, float> SuggestSelectionInternal(

diff --git a/smartselect/text-classification-model.proto b/smartselect/text-classification-model.proto
index e098e81..b96ee2f 100644
--- a/smartselect/text-classification-model.proto
+++ b/smartselect/text-classification-model.proto

@@ -98,6 +98,22 @@
   // Codepoint ranges that determine how different codepoints are tokenized.
   // The ranges must not overlap.
   repeated TokenizationCodepointRange tokenization_codepoint_config = 15;
+
+  // Method for selecting the center token.
+  enum CenterTokenSelectionMethod {
+    DEFAULT_CENTER_TOKEN_METHOD = 0;  // Invalid option.
+
+    // Use click indices to determine the center token.
+    CENTER_TOKEN_FROM_CLICK = 1;
+
+    // Use selection indices to get a token range, and select the middle of it
+    // as the center token.
+    CENTER_TOKEN_MIDDLE_OF_SELECTION = 2;
+  }
+  optional CenterTokenSelectionMethod center_token_selection_method = 16;
+
+  // If true, during training will click random token in the example selection.
+  optional bool click_random_token_in_selection = 17 [default = true];
 };
 
 extend nlp_core.EmbeddingNetworkProto {

diff --git a/tests/feature-processor_test.cc b/tests/feature-processor_test.cc
index ecdad16..652db84 100644
--- a/tests/feature-processor_test.cc
+++ b/tests/feature-processor_test.cc

@@ -106,113 +106,95 @@
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
-  SelectionWithContext selection;
-  selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const CodepointSpan span = {0, 5};
+  // clang-format off
+  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+                               Token("Lině", 6, 10),
+                               Token("Sěcond", 11, 17),
+                               Token("Lině", 18, 22),
+                               Token("Thiřd", 23, 28),
+                               Token("Lině", 29, 33)};
+  // clang-format on
 
   // Keeps the first line.
-  selection.click_start = 0;
-  selection.click_end = 5;
-  selection.selection_start = 6;
-  selection.selection_end = 10;
-
-  SelectionWithContext line_selection;
-  int shift;
-  std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
-  EXPECT_EQ(line_selection.context, "Fiřst Lině");
-  EXPECT_EQ(line_selection.click_start, 0);
-  EXPECT_EQ(line_selection.click_end, 5);
-  EXPECT_EQ(line_selection.selection_start, 6);
-  EXPECT_EQ(line_selection.selection_end, 10);
-  EXPECT_EQ(shift, 0);
+  internal::StripTokensFromOtherLines(context, span, &tokens);
+  EXPECT_THAT(tokens,
+              ElementsAreArray({Token("Fiřst", 0, 5), Token("Lině", 6, 10)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
-  SelectionWithContext selection;
-  selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const CodepointSpan span = {18, 22};
+  // clang-format off
+  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+                               Token("Lině", 6, 10),
+                               Token("Sěcond", 11, 17),
+                               Token("Lině", 18, 22),
+                               Token("Thiřd", 23, 28),
+                               Token("Lině", 29, 33)};
+  // clang-format on
 
-  // Keeps the second line.
-  selection.click_start = 11;
-  selection.click_end = 17;
-  selection.selection_start = 18;
-  selection.selection_end = 22;
-
-  SelectionWithContext line_selection;
-  int shift;
-  std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
-  EXPECT_EQ(line_selection.context, "Sěcond Lině");
-  EXPECT_EQ(line_selection.click_start, 0);
-  EXPECT_EQ(line_selection.click_end, 6);
-  EXPECT_EQ(line_selection.selection_start, 7);
-  EXPECT_EQ(line_selection.selection_end, 11);
-  EXPECT_EQ(shift, 11);
+  // Keeps the first line.
+  internal::StripTokensFromOtherLines(context, span, &tokens);
+  EXPECT_THAT(tokens, ElementsAreArray(
+                          {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickThird) {
-  SelectionWithContext selection;
-  selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const CodepointSpan span = {24, 33};
+  // clang-format off
+  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+                               Token("Lině", 6, 10),
+                               Token("Sěcond", 11, 17),
+                               Token("Lině", 18, 22),
+                               Token("Thiřd", 23, 28),
+                               Token("Lině", 29, 33)};
+  // clang-format on
 
-  // Keeps the third line.
-  selection.click_start = 29;
-  selection.click_end = 33;
-  selection.selection_start = 23;
-  selection.selection_end = 28;
-
-  SelectionWithContext line_selection;
-  int shift;
-  std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
-  EXPECT_EQ(line_selection.context, "Thiřd Lině");
-  EXPECT_EQ(line_selection.click_start, 6);
-  EXPECT_EQ(line_selection.click_end, 10);
-  EXPECT_EQ(line_selection.selection_start, 0);
-  EXPECT_EQ(line_selection.selection_end, 5);
-  EXPECT_EQ(shift, 23);
+  // Keeps the first line.
+  internal::StripTokensFromOtherLines(context, span, &tokens);
+  EXPECT_THAT(tokens, ElementsAreArray(
+                          {Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
-  SelectionWithContext selection;
-  selection.context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
+  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
+  const CodepointSpan span = {18, 22};
+  // clang-format off
+  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+                               Token("Lině", 6, 10),
+                               Token("Sěcond", 11, 17),
+                               Token("Lině", 18, 22),
+                               Token("Thiřd", 23, 28),
+                               Token("Lině", 29, 33)};
+  // clang-format on
 
-  // Keeps the second line.
-  selection.click_start = 11;
-  selection.click_end = 17;
-  selection.selection_start = 18;
-  selection.selection_end = 22;
-
-  SelectionWithContext line_selection;
-  int shift;
-  std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
-  EXPECT_EQ(line_selection.context, "Sěcond Lině");
-  EXPECT_EQ(line_selection.click_start, 0);
-  EXPECT_EQ(line_selection.click_end, 6);
-  EXPECT_EQ(line_selection.selection_start, 7);
-  EXPECT_EQ(line_selection.selection_end, 11);
-  EXPECT_EQ(shift, 11);
+  // Keeps the first line.
+  internal::StripTokensFromOtherLines(context, span, &tokens);
+  EXPECT_THAT(tokens, ElementsAreArray(
+                          {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
 }
 
 TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
-  SelectionWithContext selection;
-  selection.context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
+  const CodepointSpan span = {5, 23};
+  // clang-format off
+  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
+                               Token("Lině", 6, 10),
+                               Token("Sěcond", 18, 23),
+                               Token("Lině", 19, 23),
+                               Token("Thiřd", 23, 28),
+                               Token("Lině", 29, 33)};
+  // clang-format on
 
-  // Selects across lines, so KeepLine should not do any changes.
-  selection.click_start = 6;
-  selection.click_end = 17;
-  selection.selection_start = 0;
-  selection.selection_end = 22;
-
-  SelectionWithContext line_selection;
-  int shift;
-  std::tie(line_selection, shift) = internal::ExtractLineWithClick(selection);
-
-  EXPECT_EQ(line_selection.context, "Fiřst Lině\nSěcond Lině\nThiřd Lině");
-  EXPECT_EQ(line_selection.click_start, 6);
-  EXPECT_EQ(line_selection.click_end, 17);
-  EXPECT_EQ(line_selection.selection_start, 0);
-  EXPECT_EQ(line_selection.selection_end, 22);
-  EXPECT_EQ(shift, 0);
+  // Keeps the first line.
+  internal::StripTokensFromOtherLines(context, span, &tokens);
+  EXPECT_THAT(tokens, ElementsAreArray(
+                          {Token("Fiřst", 0, 5), Token("Lině", 6, 10),
+                           Token("Sěcond", 18, 23), Token("Lině", 19, 23),
+                           Token("Thiřd", 23, 28), Token("Lině", 29, 33)}));
 }
 
 TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
@@ -231,14 +213,6 @@
   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
   FeatureProcessor feature_processor(options);
 
-  SelectionWithContext selection_with_context;
-  selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
-  // Selection and click indices of the X in the middle:
-  selection_with_context.selection_start = 20;
-  selection_with_context.selection_end = 21;
-  selection_with_context.click_start = 20;
-  selection_with_context.click_end = 21;
-
   // Test that two subsequent runs with random context dropout produce
   // different features.
   feature_processor.SetRandom(new std::mt19937);
@@ -251,13 +225,13 @@
   CodepointSpan selection_codepoint_label;
   int classification_label;
   EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
-      selection_with_context, &features, &extra_features,
-      &selection_label_spans, &selection_label, &selection_codepoint_label,
-      &classification_label));
+      "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+      &features, &extra_features, &selection_label_spans, &selection_label,
+      &selection_codepoint_label, &classification_label));
   EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
-      selection_with_context, &features2, &extra_features,
-      &selection_label_spans, &selection_label, &selection_codepoint_label,
-      &classification_label));
+      "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+      &features2, &extra_features, &selection_label_spans, &selection_label,
+      &selection_codepoint_label, &classification_label));
 
   EXPECT_NE(features, features2);
 }
@@ -276,14 +250,6 @@
   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
   FeatureProcessor feature_processor(options);
 
-  SelectionWithContext selection_with_context;
-  selection_with_context.context = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";
-  // Selection and click indices of the X in the middle:
-  selection_with_context.selection_start = 20;
-  selection_with_context.selection_end = 21;
-  selection_with_context.click_start = 20;
-  selection_with_context.click_end = 21;
-
   std::vector<std::vector<std::pair<int, float>>> features;
   std::vector<float> extra_features;
   std::vector<CodepointSpan> selection_label_spans;
@@ -291,19 +257,14 @@
   CodepointSpan selection_codepoint_label;
   int classification_label;
   EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
-      selection_with_context, &features, &extra_features,
-      &selection_label_spans, &selection_label, &selection_codepoint_label,
-      &classification_label));
+      "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
+      &features, &extra_features, &selection_label_spans, &selection_label,
+      &selection_codepoint_label, &classification_label));
   EXPECT_EQ(19, features.size());
 
   // Should pad the string.
-  selection_with_context.context = "X";
-  selection_with_context.selection_start = 0;
-  selection_with_context.selection_end = 1;
-  selection_with_context.click_start = 0;
-  selection_with_context.click_end = 1;
   EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
-      selection_with_context, &features, &extra_features,
+      "X", {0, 1}, {0, 1}, "", &features, &extra_features,
       &selection_label_spans, &selection_label, &selection_codepoint_label,
       &classification_label));
   EXPECT_EQ(19, features.size());
@@ -432,5 +393,67 @@
       }));
 }
 
+TEST(FeatureProcessorTest, CenterTokenFromClick) {
+  int token_index;
+
+  // Exactly aligned indices.
+  token_index = internal::CenterTokenFromClick(
+      {6, 11}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+                Token("heře!", 12, 17, false)});
+  EXPECT_EQ(token_index, 1);
+
+  // Click is contained in a token.
+  token_index = internal::CenterTokenFromClick(
+      {13, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+                 Token("heře!", 12, 17, false)});
+  EXPECT_EQ(token_index, 2);
+
+  // Click spans two tokens.
+  token_index = internal::CenterTokenFromClick(
+      {6, 17}, {Token("Hělló", 0, 5, false), Token("world", 6, 11, false),
+                Token("heře!", 12, 17, false)});
+  EXPECT_EQ(token_index, kInvalidIndex);
+}
+
+TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
+  SelectionWithContext selection;
+  int token_index;
+
+  // Selection of length 3. Exactly aligned indices.
+  token_index = internal::CenterTokenFromMiddleOfSelection(
+      {7, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+                Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+                Token("Token5", 28, 34, false)});
+  EXPECT_EQ(token_index, 2);
+
+  // Selection of length 1 token. Exactly aligned indices.
+  token_index = internal::CenterTokenFromMiddleOfSelection(
+      {21, 27}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+                 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+                 Token("Token5", 28, 34, false)});
+  EXPECT_EQ(token_index, 3);
+
+  // Selection marks sub-token range, with no tokens in it.
+  token_index = internal::CenterTokenFromMiddleOfSelection(
+      {29, 33}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+                 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+                 Token("Token5", 28, 34, false)});
+  EXPECT_EQ(token_index, kInvalidIndex);
+
+  // Selection of length 2. Sub-token indices.
+  token_index = internal::CenterTokenFromMiddleOfSelection(
+      {3, 25}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+                Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+                Token("Token5", 28, 34, false)});
+  EXPECT_EQ(token_index, 1);
+
+  // Selection of length 1. Sub-token indices.
+  token_index = internal::CenterTokenFromMiddleOfSelection(
+      {22, 34}, {Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
+                 Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
+                 Token("Token5", 28, 34, false)});
+  EXPECT_EQ(token_index, 4);
+}
+
 }  // namespace
 }  // namespace libtextclassifier

diff --git a/tests/lang-id_test.cc b/tests/lang-id_test.cc
index 39aed63..faf78c6 100644
--- a/tests/lang-id_test.cc
+++ b/tests/lang-id_test.cc

@@ -30,7 +30,7 @@
 namespace {
 
 std::string GetModelPath() {
-  return "tests/testdata/langid.model";
+  return TEST_DATA_DIR "langid.model";
 }
 
 // Creates a LangId with default model.  Passes ownership to

diff --git a/tests/testdata/smartselection.model b/tests/testdata/smartselection.model
index 8972a2e..2b96f42 100644
--- a/tests/testdata/smartselection.model
+++ b/tests/testdata/smartselection.model
Binary files differ

diff --git a/tests/text-classification-model_test.cc b/tests/text-classification-model_test.cc
index a4545cd..d588fc7 100644
--- a/tests/text-classification-model_test.cc
+++ b/tests/text-classification-model_test.cc

@@ -28,28 +28,30 @@
 namespace {
 
 std::string GetModelPath() {
-  return "tests/testdata/smartselection.model";
+  return TEST_DATA_DIR "smartselection.model";
 }
 
 TEST(TextClassificationModelTest, SuggestSelection) {
   const std::string model_path = GetModelPath();
   int fd = open(model_path.c_str(), O_RDONLY);
-  std::unique_ptr<TextClassificationModel> ff_model(
+  std::unique_ptr<TextClassificationModel> model(
       new TextClassificationModel(fd));
   close(fd);
 
-  std::tuple<int, int> selection;
-  selection = ff_model->SuggestSelection(
-      "this afternoon Barack Obama gave a speech at", {15, 21});
-  EXPECT_EQ(15, std::get<0>(selection));
-  EXPECT_EQ(27, std::get<1>(selection));
+  EXPECT_EQ(model->SuggestSelection(
+                "this afternoon Barack Obama gave a speech at", {15, 21}),
+            std::make_pair(15, 27));
 
   // Try passing whole string.
-  selection =
-      ff_model->SuggestSelection("350 Third Street, Cambridge", {0, 27});
   // If more than 1 token is specified, we should return back what entered.
-  EXPECT_EQ(0, std::get<0>(selection));
-  EXPECT_EQ(27, std::get<1>(selection));
+  EXPECT_EQ(model->SuggestSelection("350 Third Street, Cambridge", {0, 27}),
+            std::make_pair(0, 27));
+
+  // Single letter.
+  EXPECT_EQ(std::make_pair(0, 1), model->SuggestSelection("a", {0, 1}));
+
+  // Single word.
+  EXPECT_EQ(std::make_pair(0, 4), model->SuggestSelection("asdf", {0, 4}));
 }
 
 TEST(TextClassificationModelTest, SuggestSelectionsAreSymmetric) {
@@ -183,6 +185,10 @@
 namespace {
 
 std::string FindBestResult(std::vector<std::pair<std::string, float>> results) {
+  if (results.empty()) {
+    return "<INVALID RESULTS>";
+  }
+
   std::sort(results.begin(), results.end(),
             [](const std::pair<std::string, float> a,
                const std::pair<std::string, float> b) {
@@ -211,6 +217,29 @@
                          "Call me at (800) 123-456 today", {11, 24})));
   EXPECT_EQ("url", FindBestResult(model->ClassifyText(
                        "Visit www.google.com every today!", {6, 20})));
+
+  // More lines.
+  EXPECT_EQ("other",
+            FindBestResult(model->ClassifyText(
+                "this afternoon Barack Obama gave a speech at|Visit "
+                "www.google.com every today!|Call me at (800) 123-456 today.",
+                {15, 27})));
+  EXPECT_EQ("url",
+            FindBestResult(model->ClassifyText(
+                "this afternoon Barack Obama gave a speech at|Visit "
+                "www.google.com every today!|Call me at (800) 123-456 today.",
+                {51, 65})));
+  EXPECT_EQ("phone",
+            FindBestResult(model->ClassifyText(
+                "this afternoon Barack Obama gave a speech at|Visit "
+                "www.google.com every today!|Call me at (800) 123-456 today.",
+                {90, 103})));
+
+  // Single word.
+  EXPECT_EQ("other", FindBestResult(model->ClassifyText("Obama", {0, 5})));
+  EXPECT_EQ("other", FindBestResult(model->ClassifyText("asdf", {0, 4})));
+  EXPECT_EQ("<INVALID RESULTS>",
+            FindBestResult(model->ClassifyText("asdf", {0, 0})));
 }
 
 }  // namespace
commit	be876dc13b917662ad8688da10e688cdaf6ce051	[log] [tgz]
author	Matt Sharifi <mns@google.com>	Fri Mar 17 17:02:43 2017 +0100
committer	Matt Sharifi <mns@google.com>	Mon Mar 20 12:55:43 2017 +0100
tree	222b245f9af830fd8c8187967d9a6372e43d45fb
parent	72282c6d2a320b89df4a3ed75df027f67c4ee6f3 [diff]