Blame - smartselect/token-feature-extractor.h - platform/external/libtextclassifier

blob: 8287fbde3d8b5b62a7e85e2bde7978d1a3e2b151 [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
				18	#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
				19
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	20	#include <memory>
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	21	#include <vector>
				22
				23	#include "base.h"
				24	#include "smartselect/types.h"
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	25	#include "util/strings/stringpiece.h"
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	26	#include "unicode/regex.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	27
				28	namespace libtextclassifier {
				29
				30	struct TokenFeatureExtractorOptions {
				31	// Number of buckets used for hashing charactergrams.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	32	int num_buckets = 0;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	33
				34	// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
				35	// character trigrams etc.
				36	std::vector<int> chargram_orders;
				37
				38	// Whether to extract the token case feature.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	39	bool extract_case_feature = false;
				40
				41	// If true, will use the unicode-aware functionality for extracting features.
				42	bool unicode_aware_features = false;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	43
				44	// Whether to extract the selection mask feature.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	45	bool extract_selection_mask_feature = false;
				46
				47	// Regexp features to extract.
				48	std::vector<std::string> regexp_features;
				49
				50	// Whether to remap digits to a single number.
				51	bool remap_digits = false;
				52
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	53	// Whether to lowercase all tokens.
				54	bool lowercase_tokens = false;
				55
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	56	// Maximum length of a word.
				57	int max_word_length = 20;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	58	};
				59
				60	class TokenFeatureExtractor {
				61	public:
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	62	explicit TokenFeatureExtractor(const TokenFeatureExtractorOptions& options);
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	63
				64	// Extracts features from a token.
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	65	// - is_in_span is a bool indicator whether the token is a part of the
				66	// selection span (true) or not (false).
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	67	// - sparse_features are indices into a sparse feature vector of size
				68	// options.num_buckets which are set to 1.0 (others are implicitly 0.0).
				69	// - dense_features are values of a dense feature vector of size 0-2
				70	// (depending on the options) for the token
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	71	bool Extract(const Token& token, bool is_in_span,
				72	std::vector<int>* sparse_features,
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	73	std::vector<float>* dense_features) const;
				74
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	75	int DenseFeaturesCount() const {
				76	return options_.extract_case_feature +
				77	options_.extract_selection_mask_feature + regex_patterns_.size();
				78	}
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	79
				80	protected:
				81	// Hashes given token to given number of buckets.
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	82	int HashToken(StringPiece token) const;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	83
				84	// Extracts the charactergram features from the token.
				85	std::vector<int> ExtractCharactergramFeatures(const Token& token) const;
				86
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	87	// Extracts the charactergram features from the token in a non-unicode-aware
				88	// way.
				89	std::vector<int> ExtractCharactergramFeaturesAscii(const Token& token) const;
				90
				91	// Extracts the charactergram features from the token in a unicode-aware way.
				92	std::vector<int> ExtractCharactergramFeaturesUnicode(
				93	const Token& token) const;
				94
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	95	private:
				96	TokenFeatureExtractorOptions options_;
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	97
				98	std::vector<std::unique_ptr<icu::RegexPattern>> regex_patterns_;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	99	};
				100
				101	} // namespace libtextclassifier
				102
				103	#endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_