actions/feature-processor.cc - platform/external/libtextclassifier - Gitiles

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "actions/feature-processor.h"

 namespace libtextclassifier3 {
 namespace {
 // Builds the TokenFeatureExtractorOptions that correspond to `options`.
 //
 // Scalar settings are copied over field by field. The chargram-order and
 // regexp-feature lists are optional: each one is copied only when its accessor
 // returns a non-null list. The selection mask feature is always switched off.
 // `options` is dereferenced unconditionally and therefore must not be null.
 TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
     const ActionsTokenFeatureProcessorOptions* const options) {
   TokenFeatureExtractorOptions result;

   // Plain scalar settings.
   result.num_buckets = options->num_buckets();
   result.max_word_length = options->max_token_length();
   result.extract_case_feature = options->extract_case_feature();
   result.unicode_aware_features = options->unicode_aware_features();
   result.extract_selection_mask_feature = false;
   result.remap_digits = options->remap_digits();
   result.lowercase_tokens = options->lowercase_tokens();

   // Optional list of chargram orders.
   const auto orders = options->chargram_orders();
   if (orders != nullptr) {
     for (const int order : *orders) {
       result.chargram_orders.push_back(order);
     }
   }

   // Optional list of regular-expression features, stored as strings.
   const auto patterns = options->regexp_features();
   if (patterns != nullptr) {
     for (const auto& pattern : *patterns) {
       result.regexp_features.push_back(pattern->str());
     }
   }

   return result;
 }
 }  // namespace

 std::unique_ptr<Tokenizer> CreateTokenizer(
     const ActionsTokenizerOptions* options, const UniLib* unilib) {
   std::vector<const TokenizationCodepointRange*> codepoint_config;
   if (options->tokenization_codepoint_config() != nullptr) {
     codepoint_config.insert(codepoint_config.end(),
                             options->tokenization_codepoint_config()->begin(),
                             options->tokenization_codepoint_config()->end());
   }
   std::vector<const CodepointRange*> internal_codepoint_config;
   if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
     internal_codepoint_config.insert(
         internal_codepoint_config.end(),
         options->internal_tokenizer_codepoint_ranges()->begin(),
         options->internal_tokenizer_codepoint_ranges()->end());
   }
   const bool tokenize_on_script_change =
       options->tokenization_codepoint_config() != nullptr &&
       options->tokenize_on_script_change();
   return std::unique_ptr<Tokenizer>(new Tokenizer(
       options->type(), unilib, codepoint_config, internal_codepoint_config,
       tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
 }

 // Constructs a feature processor from `options` and `unilib`.
 //
 // Both pointers are dereferenced in the initializer list, so neither may be
 // null. The `options` pointer is also kept in options_ and read again by
 // later calls (e.g. GetTokenEmbeddingSize()), so it presumably has to outlive
 // this object -- confirm against the declaration of options_ in the header.
 ActionsFeatureProcessor::ActionsFeatureProcessor(
     const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
     : options_(options),
       // The tokenizer is configured from the nested tokenizer options.
       tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
       // The extractor options are derived from `options` by the helper above.
       token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
                                *unilib) {}

 // Returns the number of floats AppendTokenFeatures() contributes per token:
 // the configured embedding size plus the extractor's dense feature count.
 int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
   const auto embedded_width = options_->embedding_size();
   const auto dense_width = token_feature_extractor_.DenseFeaturesCount();
   return embedded_width + dense_width;
 }

 // Appends the features of a single `token` to `output_features`.
 //
 // The appended values are the embedding of the token's sparse features
 // (options_->embedding_size() floats, written by `embedding_executor`)
 // followed by the token's dense features.
 //
 // Returns false, after logging an error, when feature extraction or embedding
 // fails. If the embedding step fails, `output_features` has already been
 // resized to make room for the embedding.
 bool ActionsFeatureProcessor::AppendTokenFeatures(
     const Token& token, const EmbeddingExecutor* embedding_executor,
     std::vector<float>* output_features) const {
   // Step 1: obtain the sparse and dense features of the token.
   std::vector<int> sparse_ids;
   std::vector<float> dense_values;
   const bool extracted = token_feature_extractor_.Extract(
       token, /*(unused) is_in_span=*/false, &sparse_ids, &dense_values);
   if (!extracted) {
     TC3_LOG(ERROR) << "Could not extract token's features.";
     return false;
   }

   // Step 2: make room for one embedding at the tail of the output and let
   // the executor write into that region directly.
   const int embedding_size = options_->embedding_size();
   const auto embedding_offset = output_features->size();
   output_features->resize(embedding_offset + embedding_size);
   float* const embedding_dest = output_features->data() + embedding_offset;
   const bool embedded = embedding_executor->AddEmbedding(
       TensorView<int>(sparse_ids.data(),
                       {static_cast<int>(sparse_ids.size())}),
       /*dest=*/embedding_dest,
       /*dest_size=*/embedding_size);
   if (!embedded) {
     TC3_LOG(ERROR) << "Could not embed token's sparse features.";
     return false;
   }

   // Step 3: the dense features go right after the embedding.
   output_features->insert(output_features->end(), dense_values.begin(),
                           dense_values.end());
   return true;
 }

 // Appends the features of every token in `tokens`, in order, to
 // `output_features` using the single-token overload.
 //
 // Stops at the first token that fails and returns false; features appended
 // for earlier tokens stay in `output_features`. Returns true when all tokens
 // were processed (including when `tokens` is empty).
 bool ActionsFeatureProcessor::AppendTokenFeatures(
     const std::vector<Token>& tokens,
     const EmbeddingExecutor* embedding_executor,
     std::vector<float>* output_features) const {
   for (const Token& current : tokens) {
     const bool appended =
         AppendTokenFeatures(current, embedding_executor, output_features);
     if (!appended) {
       return false;
     }
   }
   return true;
 }

 }  // namespace libtextclassifier3
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "actions/feature-processor.h"

	namespace libtextclassifier3 {
	namespace {
	// Builds the TokenFeatureExtractorOptions that correspond to `options`.
	//
	// Scalar settings are copied over field by field. The chargram-order and
	// regexp-feature lists are optional: each one is copied only when its
	// accessor returns a non-null list. The selection mask feature is always
	// switched off. `options` is dereferenced unconditionally and therefore must
	// not be null.
	TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
	    const ActionsTokenFeatureProcessorOptions* const options) {
	  TokenFeatureExtractorOptions result;

	  // Plain scalar settings.
	  result.num_buckets = options->num_buckets();
	  result.max_word_length = options->max_token_length();
	  result.extract_case_feature = options->extract_case_feature();
	  result.unicode_aware_features = options->unicode_aware_features();
	  result.extract_selection_mask_feature = false;
	  result.remap_digits = options->remap_digits();
	  result.lowercase_tokens = options->lowercase_tokens();

	  // Optional list of chargram orders.
	  const auto orders = options->chargram_orders();
	  if (orders != nullptr) {
	    for (const int order : *orders) {
	      result.chargram_orders.push_back(order);
	    }
	  }

	  // Optional list of regular-expression features, stored as strings.
	  const auto patterns = options->regexp_features();
	  if (patterns != nullptr) {
	    for (const auto& pattern : *patterns) {
	      result.regexp_features.push_back(pattern->str());
	    }
	  }

	  return result;
	}
	} // namespace

	std::unique_ptr<Tokenizer> CreateTokenizer(
	const ActionsTokenizerOptions* options, const UniLib* unilib) {
	std::vector<const TokenizationCodepointRange*> codepoint_config;
	if (options->tokenization_codepoint_config() != nullptr) {
	codepoint_config.insert(codepoint_config.end(),
	options->tokenization_codepoint_config()->begin(),
	options->tokenization_codepoint_config()->end());
	}
	std::vector<const CodepointRange*> internal_codepoint_config;
	if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
	internal_codepoint_config.insert(
	internal_codepoint_config.end(),
	options->internal_tokenizer_codepoint_ranges()->begin(),
	options->internal_tokenizer_codepoint_ranges()->end());
	}
	const bool tokenize_on_script_change =
	options->tokenization_codepoint_config() != nullptr &&
	options->tokenize_on_script_change();
	return std::unique_ptr<Tokenizer>(new Tokenizer(
	options->type(), unilib, codepoint_config, internal_codepoint_config,
	tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
	}

	// Constructs a feature processor from `options` and `unilib`.
	//
	// Both pointers are dereferenced in the initializer list, so neither may be
	// null. The `options` pointer is also kept in options_ and read again by
	// later calls (e.g. GetTokenEmbeddingSize()), so it presumably has to outlive
	// this object -- confirm against the declaration of options_ in the header.
	ActionsFeatureProcessor::ActionsFeatureProcessor(
	const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
	: options_(options),
	// The tokenizer is configured from the nested tokenizer options.
	tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
	// The extractor options are derived from `options` by the helper above.
	token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
	*unilib) {}

	// Returns the number of floats AppendTokenFeatures() contributes per token:
	// the configured embedding size plus the extractor's dense feature count.
	int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
	  const auto embedded_width = options_->embedding_size();
	  const auto dense_width = token_feature_extractor_.DenseFeaturesCount();
	  return embedded_width + dense_width;
	}

	// Appends the features of a single `token` to `output_features`.
	//
	// The appended values are the embedding of the token's sparse features
	// (options_->embedding_size() floats, written by `embedding_executor`)
	// followed by the token's dense features.
	//
	// Returns false, after logging an error, when feature extraction or embedding
	// fails. If the embedding step fails, `output_features` has already been
	// resized to make room for the embedding.
	bool ActionsFeatureProcessor::AppendTokenFeatures(
	    const Token& token, const EmbeddingExecutor* embedding_executor,
	    std::vector<float>* output_features) const {
	  // Extract the sparse and dense features.
	  std::vector<int> sparse_features;
	  std::vector<float> dense_features;
	  // The inline argument comments need their /* */ delimiters; with only the
	  // bare slashes the call is not valid C++.
	  if (!token_feature_extractor_.Extract(token, /*(unused) is_in_span=*/false,
	                                        &sparse_features, &dense_features)) {
	    TC3_LOG(ERROR) << "Could not extract token's features.";
	    return false;
	  }

	  // Embed the sparse features, appending them directly to the output: the
	  // output is grown by one embedding and the executor writes into that tail.
	  const int embedding_size = options_->embedding_size();
	  output_features->resize(output_features->size() + embedding_size);
	  float* output_features_end =
	      output_features->data() + output_features->size();
	  if (!embedding_executor->AddEmbedding(
	          TensorView<int>(sparse_features.data(),
	                          {static_cast<int>(sparse_features.size())}),
	          /*dest=*/output_features_end - embedding_size,
	          /*dest_size=*/embedding_size)) {
	    TC3_LOG(ERROR) << "Could not embed token's sparse features.";
	    return false;
	  }

	  // Append the dense features to the output.
	  output_features->insert(output_features->end(), dense_features.begin(),
	                          dense_features.end());
	  return true;
	}

	// Appends the features of every token in `tokens`, in order, to
	// `output_features` using the single-token overload.
	//
	// Stops at the first token that fails and returns false; features appended
	// for earlier tokens stay in `output_features`. Returns true when all tokens
	// were processed (including when `tokens` is empty).
	bool ActionsFeatureProcessor::AppendTokenFeatures(
	    const std::vector<Token>& tokens,
	    const EmbeddingExecutor* embedding_executor,
	    std::vector<float>* output_features) const {
	  for (const Token& current : tokens) {
	    const bool appended =
	        AppendTokenFeatures(current, embedding_executor, output_features);
	    if (!appended) {
	      return false;
	    }
	  }
	  return true;
	}

	} // namespace libtextclassifier3