Blame - token-feature-extractor.cc - platform/external/libtextclassifier

blob: e1941796597db14b5db5cf186628f4bc4e692450 [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	17	#include "token-feature-extractor.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	18
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	19	#include <cctype>
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	20	#include <string>
				21
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	22	#include "util/base/logging.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	23	#include "util/hash/farmhash.h"
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	24	#include "util/strings/stringpiece.h"
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	25	#include "util/utf8/unicodetext.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	26
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	27	namespace libtextclassifier2 {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	28
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	29	namespace {
				30
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	31	std::string RemapTokenAscii(const std::string& token,
				32	const TokenFeatureExtractorOptions& options) {
				33	if (!options.remap_digits && !options.lowercase_tokens) {
				34	return token;
				35	}
				36
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	37	std::string copy = token;
				38	for (int i = 0; i < token.size(); ++i) {
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	39	if (options.remap_digits && isdigit(copy[i])) {
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	40	copy[i] = '0';
				41	}
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	42	if (options.lowercase_tokens) {
				43	copy[i] = tolower(copy[i]);
				44	}
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	45	}
				46	return copy;
				47	}
				48
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	49	void RemapTokenUnicode(const std::string& token,
				50	const TokenFeatureExtractorOptions& options,
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	51	const UniLib& unilib, UnicodeText* remapped) {
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	52	if (!options.remap_digits && !options.lowercase_tokens) {
				53	// Leave remapped untouched.
				54	return;
				55	}
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	56
				57	UnicodeText word = UTF8ToUnicodeText(token, /do_copy=/false);
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	58	remapped->clear();
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	59	for (auto it = word.begin(); it != word.end(); ++it) {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	60	if (options.remap_digits && unilib.IsDigit(*it)) {
				61	remapped->AppendCodepoint('0');
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	62	} else if (options.lowercase_tokens) {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	63	remapped->AppendCodepoint(unilib.ToLower(*it));
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	64	} else {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	65	remapped->AppendCodepoint(*it);
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	66	}
				67	}
				68	}
				69
				70	} // namespace
				71
				72	TokenFeatureExtractor::TokenFeatureExtractor(
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	73	const TokenFeatureExtractorOptions& options, const UniLib& unilib)
				74	: options_(options), unilib_(unilib) {
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	75	for (const std::string& pattern : options.regexp_features) {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	76	regex_patterns_.push_back(std::unique_ptr<UniLib::RegexPattern>(
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame^]	77	unilib_.CreateRegexPattern(UTF8ToUnicodeText(
				78	pattern.c_str(), pattern.size(), /do_copy=/false))));
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	79	}
				80	}
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	81
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame^]	82	bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
				83	std::vector<int>* sparse_features,
				84	std::vector<float>* dense_features) const {
				85	if (sparse_features == nullptr \|\| dense_features == nullptr) {
				86	return false;
				87	}
				88	*sparse_features = ExtractCharactergramFeatures(token);
				89	*dense_features = ExtractDenseFeatures(token, is_in_span);
				90	return true;
				91	}
				92
				93	std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
				94	const Token& token) const {
				95	if (options_.unicode_aware_features) {
				96	return ExtractCharactergramFeaturesUnicode(token);
				97	} else {
				98	return ExtractCharactergramFeaturesAscii(token);
				99	}
				100	}
				101
				102	std::vector<float> TokenFeatureExtractor::ExtractDenseFeatures(
				103	const Token& token, bool is_in_span) const {
				104	std::vector<float> dense_features;
				105
				106	if (options_.extract_case_feature) {
				107	if (options_.unicode_aware_features) {
				108	UnicodeText token_unicode =
				109	UTF8ToUnicodeText(token.value, /do_copy=/false);
				110	const bool is_upper = unilib_.IsUpper(*token_unicode.begin());
				111	if (!token.value.empty() && is_upper) {
				112	dense_features.push_back(1.0);
				113	} else {
				114	dense_features.push_back(-1.0);
				115	}
				116	} else {
				117	if (!token.value.empty() && isupper(*token.value.begin())) {
				118	dense_features.push_back(1.0);
				119	} else {
				120	dense_features.push_back(-1.0);
				121	}
				122	}
				123	}
				124
				125	if (options_.extract_selection_mask_feature) {
				126	if (is_in_span) {
				127	dense_features.push_back(1.0);
				128	} else {
				129	if (options_.unicode_aware_features) {
				130	dense_features.push_back(-1.0);
				131	} else {
				132	dense_features.push_back(0.0);
				133	}
				134	}
				135	}
				136
				137	// Add regexp features.
				138	if (!regex_patterns_.empty()) {
				139	UnicodeText token_unicode =
				140	UTF8ToUnicodeText(token.value, /do_copy=/false);
				141	for (int i = 0; i < regex_patterns_.size(); ++i) {
				142	if (!regex_patterns_[i].get()) {
				143	dense_features.push_back(-1.0);
				144	continue;
				145	}
				146	auto matcher = regex_patterns_[i]->Matcher(token_unicode);
				147	int status;
				148	if (matcher->Matches(&status)) {
				149	dense_features.push_back(1.0);
				150	} else {
				151	dense_features.push_back(-1.0);
				152	}
				153	}
				154	}
				155
				156	return dense_features;
				157	}
				158
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	159	int TokenFeatureExtractor::HashToken(StringPiece token) const {
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	160	if (options_.allowed_chargrams.empty()) {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	161	return tc2farmhash::Fingerprint64(token) % options_.num_buckets;
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	162	} else {
				163	// Padding and out-of-vocabulary tokens have extra buckets reserved because
				164	// they are special and important tokens, and we don't want them to share
				165	// embedding with other charactergrams.
				166	// TODO(zilka): Experimentally verify.
				167	const int kNumExtraBuckets = 2;
				168	const std::string token_string = token.ToString();
				169	if (token_string == "<PAD>") {
				170	return 1;
				171	} else if (options_.allowed_chargrams.find(token_string) ==
				172	options_.allowed_chargrams.end()) {
				173	return 0; // Out-of-vocabulary.
				174	} else {
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	175	return (tc2farmhash::Fingerprint64(token) %
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	176	(options_.num_buckets - kNumExtraBuckets)) +
				177	kNumExtraBuckets;
				178	}
				179	}
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	180	}
				181
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	182	std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
				183	const Token& token) const {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	184	std::vector<int> result;
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	185	if (token.is_padding \|\| token.value.empty()) {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	186	result.push_back(HashToken("<PAD>"));
				187	} else {
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	188	const std::string word = RemapTokenAscii(token.value, options_);
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	189
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	190	// Trim words that are over max_word_length characters.
				191	const int max_word_length = options_.max_word_length;
				192	std::string feature_word;
				193	if (word.size() > max_word_length) {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	194	feature_word =
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	195	"^" + word.substr(0, max_word_length / 2) + "\1" +
				196	word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	197	"$";
				198	} else {
				199	// Add a prefix and suffix to the word.
				200	feature_word = "^" + word + "$";
				201	}
				202
				203	// Upper-bound the number of charactergram extracted to avoid resizing.
				204	result.reserve(options_.chargram_orders.size() * feature_word.size());
				205
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	206	if (options_.chargram_orders.empty()) {
				207	result.push_back(HashToken(feature_word));
				208	} else {
				209	// Generate the character-grams.
				210	for (int chargram_order : options_.chargram_orders) {
				211	if (chargram_order == 1) {
				212	for (int i = 1; i < feature_word.size() - 1; ++i) {
				213	result.push_back(
				214	HashToken(StringPiece(feature_word, /offset=/i, /len=/1)));
				215	}
				216	} else {
				217	for (int i = 0;
				218	i < static_cast<int>(feature_word.size()) - chargram_order + 1;
				219	++i) {
				220	result.push_back(HashToken(StringPiece(feature_word, /offset=/i,
				221	/len=/chargram_order)));
				222	}
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	223	}
				224	}
				225	}
				226	}
				227	return result;
				228	}
				229
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	230	std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
				231	const Token& token) const {
				232	std::vector<int> result;
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	233	if (token.is_padding \|\| token.value.empty()) {
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	234	result.push_back(HashToken("<PAD>"));
				235	} else {
				236	UnicodeText word = UTF8ToUnicodeText(token.value, /do_copy=/false);
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	237	RemapTokenUnicode(token.value, options_, unilib_, &word);
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	238
				239	// Trim the word if needed by finding a left-cut point and right-cut point.
				240	auto left_cut = word.begin();
				241	auto right_cut = word.end();
				242	for (int i = 0; i < options_.max_word_length / 2; i++) {
				243	if (left_cut < right_cut) {
				244	++left_cut;
				245	}
				246	if (left_cut < right_cut) {
				247	--right_cut;
				248	}
				249	}
				250
				251	std::string feature_word;
				252	if (left_cut == right_cut) {
				253	feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
				254	} else {
				255	// clang-format off
				256	feature_word = "^" +
				257	word.UTF8Substring(word.begin(), left_cut) +
				258	"\1" +
				259	word.UTF8Substring(right_cut, word.end()) +
				260	"$";
				261	// clang-format on
				262	}
				263
				264	const UnicodeText feature_word_unicode =
				265	UTF8ToUnicodeText(feature_word, /do_copy=/false);
				266
				267	// Upper-bound the number of charactergram extracted to avoid resizing.
				268	result.reserve(options_.chargram_orders.size() * feature_word.size());
				269
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	270	if (options_.chargram_orders.empty()) {
				271	result.push_back(HashToken(feature_word));
				272	} else {
				273	// Generate the character-grams.
				274	for (int chargram_order : options_.chargram_orders) {
				275	UnicodeText::const_iterator it_start = feature_word_unicode.begin();
				276	UnicodeText::const_iterator it_end = feature_word_unicode.end();
				277	if (chargram_order == 1) {
				278	++it_start;
				279	--it_end;
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	280	}
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	281
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	282	UnicodeText::const_iterator it_chargram_start = it_start;
				283	UnicodeText::const_iterator it_chargram_end = it_start;
				284	bool chargram_is_complete = true;
				285	for (int i = 0; i < chargram_order; ++i) {
				286	if (it_chargram_end == it_end) {
				287	chargram_is_complete = false;
				288	break;
				289	}
				290	++it_chargram_end;
				291	}
				292	if (!chargram_is_complete) {
				293	continue;
				294	}
				295
				296	for (; it_chargram_end <= it_end;
				297	++it_chargram_start, ++it_chargram_end) {
				298	const int length_bytes =
				299	it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
				300	result.push_back(HashToken(
				301	StringPiece(it_chargram_start.utf8_data(), length_bytes)));
				302	}
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	303	}
				304	}
				305	}
				306	return result;
				307	}
				308
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	309	} // namespace libtextclassifier2