Fixes crashes by making native library thread-safe, makes Annotate calls much faster by re-using tokens, fixes default values in enums in FlatBuffer schema. Test: bit FrameworksCoreTests:android.view.textclassifier.TextClassificationManagerTest Test: bit CtsViewTestCases:android.view.textclassifier.cts.TextClassificationManagerTest Bug: 74193987 Bug: 68239358 Change-Id: Ic5ca42b628280bece59d31203748072084ac452c (cherry picked from commit 2191547d7109587d73077f9d4818c691f7d7dafb) Merged-In: Ic5ca42b628280bece59d31203748072084ac452c

commit: ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40b [log] [tgz]
author: Lukas Zilka <zilka@google.com> Thu Mar 08 14:48:21 2018 +0100
committer: Lukas Zilka <zilka@google.com> Tue Mar 13 11:12:47 2018 +0000
tree: c8d542bc273a2afbd4858a4d61eea1daf87175c8
parent: df710db0da01c5f470ead4f7518ba142c4117dae [diff] [blame]
diff --git a/feature-processor.h b/feature-processor.h
index e6f33d6..553bd1e 100644
--- a/feature-processor.h
+++ b/feature-processor.h

@@ -86,6 +86,13 @@
 // Takes care of preparing features for the span prediction model.
 class FeatureProcessor {
  public:
+  // A cache mapping codepoint spans to embedded token features. An instance
+  // can be provided to multiple calls to ExtractFeatures() operating on the
+  // same context (the same codepoint spans corresponding to the same tokens),
+  // as an optimization. Note that the tokenizations do not have to be
+  // identical.
+  typedef std::map<CodepointSpan, std::vector<float>> EmbeddingCache;
+
   // If unilib is nullptr, will create and own an instance of a UniLib,
   // otherwise will use what's passed in.
   explicit FeatureProcessor(const FeatureProcessorOptions* options,
@@ -139,24 +146,25 @@
 
   const FeatureProcessorOptions* GetOptions() const { return options_; }
 
-  // Tokenizes the context and input span, and finds the click position.
-  void TokenizeAndFindClick(const std::string& context,
-                            CodepointSpan input_span,
-                            bool only_use_line_with_click,
-                            std::vector<Token>* tokens, int* click_pos) const;
+  // Retokenizes the context and input span, and finds the click position.
+  // Depending on the options, might modify tokens (split them or remove them).
+  void RetokenizeAndFindClick(const std::string& context,
+                              CodepointSpan input_span,
+                              bool only_use_line_with_click,
+                              std::vector<Token>* tokens, int* click_pos) const;
 
   // Same as above but takes UnicodeText.
-  void TokenizeAndFindClick(const UnicodeText& context_unicode,
-                            CodepointSpan input_span,
-                            bool only_use_line_with_click,
-                            std::vector<Token>* tokens, int* click_pos) const;
+  void RetokenizeAndFindClick(const UnicodeText& context_unicode,
+                              CodepointSpan input_span,
+                              bool only_use_line_with_click,
+                              std::vector<Token>* tokens, int* click_pos) const;
 
   // Extracts features as a CachedFeatures object that can be used for repeated
   // inference over token spans in the given context.
   bool ExtractFeatures(const std::vector<Token>& tokens, TokenSpan token_span,
                        CodepointSpan selection_span_for_feature,
-                       EmbeddingExecutor* embedding_executor,
-                       int feature_vector_size,
+                       const EmbeddingExecutor* embedding_executor,
+                       EmbeddingCache* embedding_cache, int feature_vector_size,
                        std::unique_ptr<CachedFeatures>* cached_features) const;
 
   // Fills selection_label_spans with CodepointSpans that correspond to the
@@ -280,6 +288,15 @@
                                  CodepointSpan span,
                                  std::vector<Token>* tokens) const;
 
+  // Extracts the features of a token and appends them to the output vector.
+  // Uses the embedding cache to avoid re-extracting and re-embedding the
+  // sparse features for the same token.
+  bool AppendTokenFeaturesWithCache(const Token& token,
+                                    CodepointSpan selection_span_for_feature,
+                                    const EmbeddingExecutor* embedding_executor,
+                                    EmbeddingCache* embedding_cache,
+                                    std::vector<float>* output_features) const;
+
  private:
   std::unique_ptr<UniLib> owned_unilib_;
   const UniLib* unilib_;
commit	ba849e7b63cdf4a38e6ef1a5a9ffd60567d7c40b	[log] [tgz]
author	Lukas Zilka <zilka@google.com>	Thu Mar 08 14:48:21 2018 +0100
committer	Lukas Zilka <zilka@google.com>	Tue Mar 13 11:12:47 2018 +0000
tree	c8d542bc273a2afbd4858a4d61eea1daf87175c8
parent	df710db0da01c5f470ead4f7518ba142c4117dae [diff] [blame]