Blame - smartselect/token-feature-extractor.h - platform/external/libtextclassifier

blob: 9ba695e76842bac5532458ba81156027693024b7 [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
				18	#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_
				19
				20	#include <vector>
				21
				22	#include "base.h"
				23	#include "smartselect/types.h"
				24
				25	namespace libtextclassifier {
				26
				27	struct TokenFeatureExtractorOptions {
				28	// Number of buckets used for hashing charactergrams.
				29	int num_buckets;
				30
				31	// Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
				32	// character trigrams etc.
				33	std::vector<int> chargram_orders;
				34
				35	// Whether to extract the token case feature.
				36	bool extract_case_feature;
				37
				38	// Whether to extract the selection mask feature.
				39	bool extract_selection_mask_feature;
				40	};
				41
				42	class TokenFeatureExtractor {
				43	public:
				44	explicit TokenFeatureExtractor(const TokenFeatureExtractorOptions& options)
				45	: options_(options) {}
				46
				47	// Extracts features from a token.
				48	// - sparse_features are indices into a sparse feature vector of size
				49	// options.num_buckets which are set to 1.0 (others are implicitly 0.0).
				50	// - dense_features are values of a dense feature vector of size 0-2
				51	// (depending on the options) for the token
				52	bool Extract(const Token& token, std::vector<int>* sparse_features,
				53	std::vector<float>* dense_features) const;
				54
				55	// Convenience method that sequentially applies Extract to each Token.
				56	bool Extract(const std::vector<Token>& tokens,
				57	std::vector<std::vector<int>>* sparse_features,
				58	std::vector<std::vector<float>>* dense_features) const;
				59
				60	protected:
				61	// Hashes given token to given number of buckets.
				62	int HashToken(const std::string& token) const;
				63
				64	// Extracts the charactergram features from the token.
				65	std::vector<int> ExtractCharactergramFeatures(const Token& token) const;
				66
				67	private:
				68	TokenFeatureExtractorOptions options_;
				69	};
				70
				71	} // namespace libtextclassifier
				72
				73	#endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKEN_FEATURE_EXTRACTOR_H_