Blame - token-feature-extractor.h - platform/external/libtextclassifier

blob: fee1355dcff60c52896c58542824b5a9e49249f0 [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame]	17	#ifndef LIBTEXTCLASSIFIER_TOKEN_FEATURE_EXTRACTOR_H_
				18	#define LIBTEXTCLASSIFIER_TOKEN_FEATURE_EXTRACTOR_H_
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	19
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
				23
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	24	#include "types.h"
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	25	#include "util/strings/stringpiece.h"
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	26	#include "util/utf8/unilib.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	27
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	28	namespace libtextclassifier2 {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	29
				30	struct TokenFeatureExtractorOptions {
				31	// Number of buckets used for hashing charactergrams.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	32	int num_buckets = 0;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	33
				34	// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
				35	// character trigrams etc.
				36	std::vector<int> chargram_orders;
				37
				38	// Whether to extract the token case feature.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	39	bool extract_case_feature = false;
				40
				41	// If true, will use the unicode-aware functionality for extracting features.
				42	bool unicode_aware_features = false;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	43
				44	// Whether to extract the selection mask feature.
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	45	bool extract_selection_mask_feature = false;
				46
				47	// Regexp features to extract.
				48	std::vector<std::string> regexp_features;
				49
				50	// Whether to remap digits to a single number.
				51	bool remap_digits = false;
				52
Matt Sharifi	deb722d	2017-04-24 13:30:47 +0200	[diff] [blame]	53	// Whether to lowercase all tokens.
				54	bool lowercase_tokens = false;
				55
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	56	// Maximum length of a word.
				57	int max_word_length = 20;
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	58
				59	// List of allowed charactergrams. The extracted charactergrams are filtered
				60	// using this list, and charactergrams that are not present are interpreted as
				61	// out-of-vocabulary.
				62	// If no allowed_chargrams are specified, all charactergrams are allowed.
				63	std::unordered_set<std::string> allowed_chargrams;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	64	};
				65
				66	class TokenFeatureExtractor {
				67	public:
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	68	TokenFeatureExtractor(const TokenFeatureExtractorOptions& options,
				69	const UniLib& unilib);
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	70
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame]	71	// Extracts both the sparse (charactergram) and the dense features from a
				72	// token. is_in_span is a bool indicator whether the token is a part of the
				73	// selection span (true) or not (false).
Lukas Zilka	ba849e7	2018-03-08 14:48:21 +0100	[diff] [blame]	74	// The sparse_features output is optional. Fails and returns false if
				75	// dense_fatures in a nullptr.
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	76	bool Extract(const Token& token, bool is_in_span,
				77	std::vector<int>* sparse_features,
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	78	std::vector<float>* dense_features) const;
				79
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame]	80	// Extracts the sparse (charactergram) features from the token.
				81	std::vector<int> ExtractCharactergramFeatures(const Token& token) const;
				82
				83	// Extracts the dense features from the token. is_in_span is a bool indicator
				84	// whether the token is a part of the selection span (true) or not (false).
				85	std::vector<float> ExtractDenseFeatures(const Token& token,
				86	bool is_in_span) const;
				87
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	88	int DenseFeaturesCount() const {
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	89	int feature_count =
				90	options_.extract_case_feature + options_.extract_selection_mask_feature;
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	91	feature_count += regex_patterns_.size();
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	92	return feature_count;
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	93	}
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	94
				95	protected:
				96	// Hashes given token to given number of buckets.
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	97	int HashToken(StringPiece token) const;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	98
Lukas Zilka	d3bc59a	2017-04-03 17:32:27 +0200	[diff] [blame]	99	// Extracts the charactergram features from the token in a non-unicode-aware
				100	// way.
				101	std::vector<int> ExtractCharactergramFeaturesAscii(const Token& token) const;
				102
				103	// Extracts the charactergram features from the token in a unicode-aware way.
				104	std::vector<int> ExtractCharactergramFeaturesUnicode(
				105	const Token& token) const;
				106
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	107	private:
				108	TokenFeatureExtractorOptions options_;
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	109	std::vector<std::unique_ptr<UniLib::RegexPattern>> regex_patterns_;
				110	const UniLib& unilib_;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	111	};
				112
Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	113	} // namespace libtextclassifier2
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	114
Lukas Zilka	b23e212	2018-02-09 10:25:19 +0100	[diff] [blame]	115	#endif // LIBTEXTCLASSIFIER_TOKEN_FEATURE_EXTRACTOR_H_