/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 16 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 17 | #include "token-feature-extractor.h" |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 18 | |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 19 | #include <cctype> |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 20 | #include <string> |
| 21 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 22 | #include "util/base/logging.h" |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 23 | #include "util/hash/farmhash.h" |
Lukas Zilka | 26e8c2e | 2017-04-06 15:54:24 +0200 | [diff] [blame] | 24 | #include "util/strings/stringpiece.h" |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 25 | #include "util/utf8/unicodetext.h" |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 26 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 27 | namespace libtextclassifier2 { |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 28 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 29 | namespace { |
| 30 | |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 31 | std::string RemapTokenAscii(const std::string& token, |
| 32 | const TokenFeatureExtractorOptions& options) { |
| 33 | if (!options.remap_digits && !options.lowercase_tokens) { |
| 34 | return token; |
| 35 | } |
| 36 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 37 | std::string copy = token; |
| 38 | for (int i = 0; i < token.size(); ++i) { |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 39 | if (options.remap_digits && isdigit(copy[i])) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 40 | copy[i] = '0'; |
| 41 | } |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 42 | if (options.lowercase_tokens) { |
| 43 | copy[i] = tolower(copy[i]); |
| 44 | } |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 45 | } |
| 46 | return copy; |
| 47 | } |
| 48 | |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 49 | void RemapTokenUnicode(const std::string& token, |
| 50 | const TokenFeatureExtractorOptions& options, |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 51 | const UniLib& unilib, UnicodeText* remapped) { |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 52 | if (!options.remap_digits && !options.lowercase_tokens) { |
| 53 | // Leave remapped untouched. |
| 54 | return; |
| 55 | } |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 56 | |
| 57 | UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 58 | remapped->clear(); |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 59 | for (auto it = word.begin(); it != word.end(); ++it) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 60 | if (options.remap_digits && unilib.IsDigit(*it)) { |
| 61 | remapped->AppendCodepoint('0'); |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 62 | } else if (options.lowercase_tokens) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 63 | remapped->AppendCodepoint(unilib.ToLower(*it)); |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 64 | } else { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 65 | remapped->AppendCodepoint(*it); |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 66 | } |
| 67 | } |
| 68 | } |
| 69 | |
| 70 | } // namespace |
| 71 | |
| 72 | TokenFeatureExtractor::TokenFeatureExtractor( |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 73 | const TokenFeatureExtractorOptions& options, const UniLib& unilib) |
| 74 | : options_(options), unilib_(unilib) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 75 | for (const std::string& pattern : options.regexp_features) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 76 | regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>( |
| 77 | unilib_.CreateRegexPattern(pattern))); |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 78 | } |
| 79 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 80 | |
Lukas Zilka | 26e8c2e | 2017-04-06 15:54:24 +0200 | [diff] [blame] | 81 | int TokenFeatureExtractor::HashToken(StringPiece token) const { |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 82 | if (options_.allowed_chargrams.empty()) { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 83 | return tc2farmhash::Fingerprint64(token) % options_.num_buckets; |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 84 | } else { |
| 85 | // Padding and out-of-vocabulary tokens have extra buckets reserved because |
| 86 | // they are special and important tokens, and we don't want them to share |
| 87 | // embedding with other charactergrams. |
| 88 | // TODO(zilka): Experimentally verify. |
| 89 | const int kNumExtraBuckets = 2; |
| 90 | const std::string token_string = token.ToString(); |
| 91 | if (token_string == "<PAD>") { |
| 92 | return 1; |
| 93 | } else if (options_.allowed_chargrams.find(token_string) == |
| 94 | options_.allowed_chargrams.end()) { |
| 95 | return 0; // Out-of-vocabulary. |
| 96 | } else { |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 97 | return (tc2farmhash::Fingerprint64(token) % |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 98 | (options_.num_buckets - kNumExtraBuckets)) + |
| 99 | kNumExtraBuckets; |
| 100 | } |
| 101 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 102 | } |
| 103 | |
| 104 | std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures( |
| 105 | const Token& token) const { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 106 | if (options_.unicode_aware_features) { |
| 107 | return ExtractCharactergramFeaturesUnicode(token); |
| 108 | } else { |
| 109 | return ExtractCharactergramFeaturesAscii(token); |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii( |
| 114 | const Token& token) const { |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 115 | std::vector<int> result; |
Lukas Zilka | 26e8c2e | 2017-04-06 15:54:24 +0200 | [diff] [blame] | 116 | if (token.is_padding || token.value.empty()) { |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 117 | result.push_back(HashToken("<PAD>")); |
| 118 | } else { |
Matt Sharifi | deb722d | 2017-04-24 13:30:47 +0200 | [diff] [blame] | 119 | const std::string word = RemapTokenAscii(token.value, options_); |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 120 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 121 | // Trim words that are over max_word_length characters. |
| 122 | const int max_word_length = options_.max_word_length; |
| 123 | std::string feature_word; |
| 124 | if (word.size() > max_word_length) { |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 125 | feature_word = |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 126 | "^" + word.substr(0, max_word_length / 2) + "\1" + |
| 127 | word.substr(word.size() - max_word_length / 2, max_word_length / 2) + |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 128 | "$"; |
| 129 | } else { |
| 130 | // Add a prefix and suffix to the word. |
| 131 | feature_word = "^" + word + "$"; |
| 132 | } |
| 133 | |
| 134 | // Upper-bound the number of charactergram extracted to avoid resizing. |
| 135 | result.reserve(options_.chargram_orders.size() * feature_word.size()); |
| 136 | |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 137 | if (options_.chargram_orders.empty()) { |
| 138 | result.push_back(HashToken(feature_word)); |
| 139 | } else { |
| 140 | // Generate the character-grams. |
| 141 | for (int chargram_order : options_.chargram_orders) { |
| 142 | if (chargram_order == 1) { |
| 143 | for (int i = 1; i < feature_word.size() - 1; ++i) { |
| 144 | result.push_back( |
| 145 | HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1))); |
| 146 | } |
| 147 | } else { |
| 148 | for (int i = 0; |
| 149 | i < static_cast<int>(feature_word.size()) - chargram_order + 1; |
| 150 | ++i) { |
| 151 | result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i, |
| 152 | /*len=*/chargram_order))); |
| 153 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 154 | } |
| 155 | } |
| 156 | } |
| 157 | } |
| 158 | return result; |
| 159 | } |
| 160 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 161 | std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode( |
| 162 | const Token& token) const { |
| 163 | std::vector<int> result; |
Lukas Zilka | 26e8c2e | 2017-04-06 15:54:24 +0200 | [diff] [blame] | 164 | if (token.is_padding || token.value.empty()) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 165 | result.push_back(HashToken("<PAD>")); |
| 166 | } else { |
| 167 | UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 168 | RemapTokenUnicode(token.value, options_, unilib_, &word); |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 169 | |
| 170 | // Trim the word if needed by finding a left-cut point and right-cut point. |
| 171 | auto left_cut = word.begin(); |
| 172 | auto right_cut = word.end(); |
| 173 | for (int i = 0; i < options_.max_word_length / 2; i++) { |
| 174 | if (left_cut < right_cut) { |
| 175 | ++left_cut; |
| 176 | } |
| 177 | if (left_cut < right_cut) { |
| 178 | --right_cut; |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | std::string feature_word; |
| 183 | if (left_cut == right_cut) { |
| 184 | feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$"; |
| 185 | } else { |
| 186 | // clang-format off |
| 187 | feature_word = "^" + |
| 188 | word.UTF8Substring(word.begin(), left_cut) + |
| 189 | "\1" + |
| 190 | word.UTF8Substring(right_cut, word.end()) + |
| 191 | "$"; |
| 192 | // clang-format on |
| 193 | } |
| 194 | |
| 195 | const UnicodeText feature_word_unicode = |
| 196 | UTF8ToUnicodeText(feature_word, /*do_copy=*/false); |
| 197 | |
| 198 | // Upper-bound the number of charactergram extracted to avoid resizing. |
| 199 | result.reserve(options_.chargram_orders.size() * feature_word.size()); |
| 200 | |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 201 | if (options_.chargram_orders.empty()) { |
| 202 | result.push_back(HashToken(feature_word)); |
| 203 | } else { |
| 204 | // Generate the character-grams. |
| 205 | for (int chargram_order : options_.chargram_orders) { |
| 206 | UnicodeText::const_iterator it_start = feature_word_unicode.begin(); |
| 207 | UnicodeText::const_iterator it_end = feature_word_unicode.end(); |
| 208 | if (chargram_order == 1) { |
| 209 | ++it_start; |
| 210 | --it_end; |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 211 | } |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 212 | |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 213 | UnicodeText::const_iterator it_chargram_start = it_start; |
| 214 | UnicodeText::const_iterator it_chargram_end = it_start; |
| 215 | bool chargram_is_complete = true; |
| 216 | for (int i = 0; i < chargram_order; ++i) { |
| 217 | if (it_chargram_end == it_end) { |
| 218 | chargram_is_complete = false; |
| 219 | break; |
| 220 | } |
| 221 | ++it_chargram_end; |
| 222 | } |
| 223 | if (!chargram_is_complete) { |
| 224 | continue; |
| 225 | } |
| 226 | |
| 227 | for (; it_chargram_end <= it_end; |
| 228 | ++it_chargram_start, ++it_chargram_end) { |
| 229 | const int length_bytes = |
| 230 | it_chargram_end.utf8_data() - it_chargram_start.utf8_data(); |
| 231 | result.push_back(HashToken( |
| 232 | StringPiece(it_chargram_start.utf8_data(), length_bytes))); |
| 233 | } |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 234 | } |
| 235 | } |
| 236 | } |
| 237 | return result; |
| 238 | } |
| 239 | |
Lukas Zilka | 6bb39a8 | 2017-04-07 19:55:11 +0200 | [diff] [blame] | 240 | bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span, |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 241 | std::vector<int>* sparse_features, |
| 242 | std::vector<float>* dense_features) const { |
| 243 | if (sparse_features == nullptr || dense_features == nullptr) { |
| 244 | return false; |
| 245 | } |
| 246 | |
| 247 | *sparse_features = ExtractCharactergramFeatures(token); |
| 248 | |
| 249 | if (options_.extract_case_feature) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 250 | if (options_.unicode_aware_features) { |
| 251 | UnicodeText token_unicode = |
| 252 | UTF8ToUnicodeText(token.value, /*do_copy=*/false); |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 253 | const bool is_upper = unilib_.IsUpper(*token_unicode.begin()); |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 254 | if (!token.value.empty() && is_upper) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 255 | dense_features->push_back(1.0); |
| 256 | } else { |
| 257 | dense_features->push_back(-1.0); |
| 258 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 259 | } else { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 260 | if (!token.value.empty() && isupper(*token.value.begin())) { |
| 261 | dense_features->push_back(1.0); |
| 262 | } else { |
| 263 | dense_features->push_back(-1.0); |
| 264 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 265 | } |
| 266 | } |
| 267 | |
| 268 | if (options_.extract_selection_mask_feature) { |
Lukas Zilka | 6bb39a8 | 2017-04-07 19:55:11 +0200 | [diff] [blame] | 269 | if (is_in_span) { |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 270 | dense_features->push_back(1.0); |
| 271 | } else { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 272 | if (options_.unicode_aware_features) { |
| 273 | dense_features->push_back(-1.0); |
| 274 | } else { |
| 275 | dense_features->push_back(0.0); |
| 276 | } |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 277 | } |
| 278 | } |
| 279 | |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 280 | // Add regexp features. |
| 281 | if (!regex_patterns_.empty()) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 282 | for (int i = 0; i < regex_patterns_.size(); ++i) { |
| 283 | if (!regex_patterns_[i].get()) { |
| 284 | dense_features->push_back(-1.0); |
| 285 | continue; |
| 286 | } |
| 287 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 288 | if (regex_patterns_[i]->Matches(token.value)) { |
Lukas Zilka | d3bc59a | 2017-04-03 17:32:27 +0200 | [diff] [blame] | 289 | dense_features->push_back(1.0); |
| 290 | } else { |
| 291 | dense_features->push_back(-1.0); |
| 292 | } |
| 293 | } |
| 294 | } |
Lukas Zilka | e5ea2ab | 2017-10-11 10:50:05 +0200 | [diff] [blame] | 295 | |
Matt Sharifi | bda09f1 | 2017-03-10 12:29:15 +0100 | [diff] [blame] | 296 | return true; |
| 297 | } |
| 298 | |
Lukas Zilka | 21d8c98 | 2018-01-24 11:11:20 +0100 | [diff] [blame] | 299 | } // namespace libtextclassifier2 |