blob: a1a991901c22b6a25940e409a59562a263d6c14a [file] [log] [blame]
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
18#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
19
#include <memory>
#include <string>
#include <vector>

#include "base.h"
#include "smartselect/types.h"
#include "unicode/regex.h"
Matt Sharifibda09f12017-03-10 12:29:15 +010026
27namespace libtextclassifier {
28
// Configuration for TokenFeatureExtractor.
//
// Every field has an in-class default, so a default-constructed instance is
// fully initialized: all boolean switches are off, both containers are empty,
// num_buckets is 0 and max_word_length is 20.
struct TokenFeatureExtractorOptions {
  // Number of buckets used for hashing charactergrams. Sparse feature indices
  // produced by the extractor index into a vector of this size.
  int num_buckets = 0;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  std::vector<int> chargram_orders;

  // Whether to extract the token case feature.
  bool extract_case_feature = false;

  // If true, will use the unicode-aware functionality for extracting features.
  bool unicode_aware_features = false;

  // Whether to extract the selection mask feature.
  bool extract_selection_mask_feature = false;

  // Regexp features to extract, given as pattern strings.
  // NOTE(review): the pattern syntax is presumably ICU's, since the extractor
  // holds icu::RegexPattern objects -- confirm against the implementation.
  std::vector<std::string> regexp_features;

  // Whether to remap digits to a single number.
  bool remap_digits = false;

  // Maximum length of a word.
  // NOTE(review): how longer words are treated is not visible in this header.
  int max_word_length = 20;
};
55
56class TokenFeatureExtractor {
57 public:
Lukas Zilkad3bc59a2017-04-03 17:32:27 +020058 explicit TokenFeatureExtractor(const TokenFeatureExtractorOptions& options);
Matt Sharifibda09f12017-03-10 12:29:15 +010059
60 // Extracts features from a token.
61 // - sparse_features are indices into a sparse feature vector of size
62 // options.num_buckets which are set to 1.0 (others are implicitly 0.0).
63 // - dense_features are values of a dense feature vector of size 0-2
64 // (depending on the options) for the token
65 bool Extract(const Token& token, std::vector<int>* sparse_features,
66 std::vector<float>* dense_features) const;
67
68 // Convenience method that sequentially applies Extract to each Token.
69 bool Extract(const std::vector<Token>& tokens,
70 std::vector<std::vector<int>>* sparse_features,
71 std::vector<std::vector<float>>* dense_features) const;
72
73 protected:
74 // Hashes given token to given number of buckets.
75 int HashToken(const std::string& token) const;
76
77 // Extracts the charactergram features from the token.
78 std::vector<int> ExtractCharactergramFeatures(const Token& token) const;
79
Lukas Zilkad3bc59a2017-04-03 17:32:27 +020080 // Extracts the charactergram features from the token in a non-unicode-aware
81 // way.
82 std::vector<int> ExtractCharactergramFeaturesAscii(const Token& token) const;
83
84 // Extracts the charactergram features from the token in a unicode-aware way.
85 std::vector<int> ExtractCharactergramFeaturesUnicode(
86 const Token& token) const;
87
Matt Sharifibda09f12017-03-10 12:29:15 +010088 private:
89 TokenFeatureExtractorOptions options_;
Lukas Zilkad3bc59a2017-04-03 17:32:27 +020090
91 std::vector<std::unique_ptr<icu::RegexPattern>> regex_patterns_;
Matt Sharifibda09f12017-03-10 12:29:15 +010092};
93
94} // namespace libtextclassifier
95
96#endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_