Blame - actions/feature-processor.cc - platform/external/libtextclassifier

blob: d0b2072168bae3dba941fd31eeed9310a21e3380 [file] [log] [blame]

Tony Mak	ad2e22d	2019-03-20 17:35:13 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2018 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "actions/feature-processor.h"
				18
				19	namespace libtextclassifier3 {
				20	namespace {
				21	TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
				22	const ActionsTokenFeatureProcessorOptions* const options) {
				23	TokenFeatureExtractorOptions extractor_options;
				24	extractor_options.num_buckets = options->num_buckets();
				25	if (options->chargram_orders() != nullptr) {
				26	for (int order : *options->chargram_orders()) {
				27	extractor_options.chargram_orders.push_back(order);
				28	}
				29	}
				30	extractor_options.max_word_length = options->max_token_length();
				31	extractor_options.extract_case_feature = options->extract_case_feature();
				32	extractor_options.unicode_aware_features = options->unicode_aware_features();
				33	extractor_options.extract_selection_mask_feature = false;
				34	if (options->regexp_features() != nullptr) {
				35	for (const auto& regexp_feauture : *options->regexp_features()) {
				36	extractor_options.regexp_features.push_back(regexp_feauture->str());
				37	}
				38	}
				39	extractor_options.remap_digits = options->remap_digits();
				40	extractor_options.lowercase_tokens = options->lowercase_tokens();
				41	return extractor_options;
				42	}
				43	} // namespace
				44
				45	std::unique_ptr<Tokenizer> CreateTokenizer(
				46	const ActionsTokenizerOptions* options, const UniLib* unilib) {
				47	std::vector<const TokenizationCodepointRange*> codepoint_config;
				48	if (options->tokenization_codepoint_config() != nullptr) {
				49	codepoint_config.insert(codepoint_config.end(),
				50	options->tokenization_codepoint_config()->begin(),
				51	options->tokenization_codepoint_config()->end());
				52	}
				53	std::vector<const CodepointRange*> internal_codepoint_config;
				54	if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
				55	internal_codepoint_config.insert(
				56	internal_codepoint_config.end(),
				57	options->internal_tokenizer_codepoint_ranges()->begin(),
				58	options->internal_tokenizer_codepoint_ranges()->end());
				59	}
				60	const bool tokenize_on_script_change =
				61	options->tokenization_codepoint_config() != nullptr &&
				62	options->tokenize_on_script_change();
				63	return std::unique_ptr<Tokenizer>(new Tokenizer(
				64	options->type(), unilib, codepoint_config, internal_codepoint_config,
				65	tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
				66	}
				67
				68	ActionsFeatureProcessor::ActionsFeatureProcessor(
				69	const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
				70	: options_(options),
				71	tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
				72	token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
				73	*unilib) {}
				74
				75	int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
				76	return options_->embedding_size() +
				77	token_feature_extractor_.DenseFeaturesCount();
				78	}
				79
Tony Mak	83d2de6	2019-04-10 16:12:15 +0100	[diff] [blame]	80	bool ActionsFeatureProcessor::AppendFeatures(
				81	const std::vector<int>& sparse_features,
				82	const std::vector<float>& dense_features,
				83	const EmbeddingExecutor* embedding_executor,
Tony Mak	ad2e22d	2019-03-20 17:35:13 +0000	[diff] [blame]	84	std::vector<float>* output_features) const {
Tony Mak	ad2e22d	2019-03-20 17:35:13 +0000	[diff] [blame]	85	// Embed the sparse features, appending them directly to the output.
				86	const int embedding_size = options_->embedding_size();
				87	output_features->resize(output_features->size() + embedding_size);
				88	float* output_features_end =
				89	output_features->data() + output_features->size();
				90	if (!embedding_executor->AddEmbedding(
				91	TensorView<int>(sparse_features.data(),
				92	{static_cast<int>(sparse_features.size())}),
				93	/dest=/output_features_end - embedding_size,
				94	/dest_size=/embedding_size)) {
				95	TC3_LOG(ERROR) << "Could not embed token's sparse features.";
				96	return false;
				97	}
				98
				99	// Append the dense features to the output.
				100	output_features->insert(output_features->end(), dense_features.begin(),
				101	dense_features.end());
				102	return true;
				103	}
				104
				105	bool ActionsFeatureProcessor::AppendTokenFeatures(
Tony Mak	83d2de6	2019-04-10 16:12:15 +0100	[diff] [blame]	106	const Token& token, const EmbeddingExecutor* embedding_executor,
				107	std::vector<float>* output_features) const {
				108	// Extract the sparse and dense features.
				109	std::vector<int> sparse_features;
				110	std::vector<float> dense_features;
				111	if (!token_feature_extractor_.Extract(token, /(unused) is_in_span=/false,
				112	&sparse_features, &dense_features)) {
				113	TC3_LOG(ERROR) << "Could not extract token's features.";
				114	return false;
				115	}
				116	return AppendFeatures(sparse_features, dense_features, embedding_executor,
				117	output_features);
				118	}
				119
				120	bool ActionsFeatureProcessor::AppendTokenFeatures(
Tony Mak	ad2e22d	2019-03-20 17:35:13 +0000	[diff] [blame]	121	const std::vector<Token>& tokens,
				122	const EmbeddingExecutor* embedding_executor,
				123	std::vector<float>* output_features) const {
				124	for (const Token& token : tokens) {
				125	if (!AppendTokenFeatures(token, embedding_executor, output_features)) {
				126	return false;
				127	}
				128	}
				129	return true;
				130	}
				131
				132	} // namespace libtextclassifier3