/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "actions/feature-processor.h"

namespace libtextclassifier3 {
namespace {
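// Copies the token feature extractor configuration out of the
// flatbuffer-backed feature processor options. The selection mask feature is
// always disabled here.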
TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
    const ActionsTokenFeatureProcessorOptions* const options) {
  TokenFeatureExtractorOptions extractor_options;
  extractor_options.num_buckets = options->num_buckets();
  if (options->chargram_orders() != nullptr) {
    for (int order : *options->chargram_orders()) {
      extractor_options.chargram_orders.push_back(order);
    }
  }
  extractor_options.max_word_length = options->max_token_length();
  extractor_options.extract_case_feature = options->extract_case_feature();
  extractor_options.unicode_aware_features = options->unicode_aware_features();
  extractor_options.extract_selection_mask_feature = false;
  if (options->regexp_features() != nullptr) {
    for (const auto& regexp_feature : *options->regexp_features()) {
      extractor_options.regexp_features.push_back(regexp_feature->str());
    }
  }
  extractor_options.remap_digits = options->remap_digits();
  extractor_options.lowercase_tokens = options->lowercase_tokens();
  return extractor_options;
}
}  // namespace

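// Builds a Tokenizer from the tokenizer options. Tokenization on script
// changes is only enabled when a codepoint configuration is provided.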
std::unique_ptr<Tokenizer> CreateTokenizer(
    const ActionsTokenizerOptions* options, const UniLib* unilib) {
  std::vector<const TokenizationCodepointRange*> codepoint_config;
  if (options->tokenization_codepoint_config() != nullptr) {
    codepoint_config.insert(codepoint_config.end(),
                            options->tokenization_codepoint_config()->begin(),
                            options->tokenization_codepoint_config()->end());
  }
  std::vector<const CodepointRange*> internal_codepoint_config;
  if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
    internal_codepoint_config.insert(
        internal_codepoint_config.end(),
        options->internal_tokenizer_codepoint_ranges()->begin(),
        options->internal_tokenizer_codepoint_ranges()->end());
  }
  const bool tokenize_on_script_change =
      options->tokenization_codepoint_config() != nullptr &&
      options->tokenize_on_script_change();
  return std::unique_ptr<Tokenizer>(new Tokenizer(
      options->type(), unilib, codepoint_config, internal_codepoint_config,
      tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
}

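// Sets up the tokenizer and the token feature extractor from the feature
// processor options.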
ActionsFeatureProcessor::ActionsFeatureProcessor(
    const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
    : options_(options),
      tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
      token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
                               *unilib) {}

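// The feature vector of a token is the sparse feature embedding followed by
// the dense features, so its size is the sum of the two.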
int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
  return options_->embedding_size() +
         token_feature_extractor_.DenseFeaturesCount();
}

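// Appends the embedded sparse features and the dense features of a single
// token to `output_features`.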
bool ActionsFeatureProcessor::AppendFeatures(
    const std::vector<int>& sparse_features,
    const std::vector<float>& dense_features,
    const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  // Embed the sparse features, appending them directly to the output.
  const int embedding_size = options_->embedding_size();
  output_features->resize(output_features->size() + embedding_size);
  float* output_features_end =
      output_features->data() + output_features->size();
  if (!embedding_executor->AddEmbedding(
          TensorView<int>(sparse_features.data(),
                          {static_cast<int>(sparse_features.size())}),
          /*dest=*/output_features_end - embedding_size,
          /*dest_size=*/embedding_size)) {
    TC3_LOG(ERROR) << "Could not embed token's sparse features.";
    return false;
  }

  // Append the dense features to the output.
  output_features->insert(output_features->end(), dense_features.begin(),
                          dense_features.end());
  return true;
}

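// Extracts the sparse and dense features of a single token and appends them
// to `output_features`.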
bool ActionsFeatureProcessor::AppendTokenFeatures(
    const Token& token, const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  // Extract the sparse and dense features.
  std::vector<int> sparse_features;
  std::vector<float> dense_features;
  if (!token_feature_extractor_.Extract(token, /*(unused) is_in_span=*/false,
                                        &sparse_features, &dense_features)) {
    TC3_LOG(ERROR) << "Could not extract token's features.";
    return false;
  }
  return AppendFeatures(sparse_features, dense_features, embedding_executor,
                        output_features);
}

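// Appends the features of each token in order; stops and fails on the first
// token whose features cannot be extracted or embedded.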
bool ActionsFeatureProcessor::AppendTokenFeatures(
    const std::vector<Token>& tokens,
    const EmbeddingExecutor* embedding_executor,
    std::vector<float>* output_features) const {
  for (const Token& token : tokens) {
    if (!AppendTokenFeatures(token, embedding_executor, output_features)) {
      return false;
    }
  }
  return true;
}

}  // namespace libtextclassifier3