Blame - smartselect/feature-processor.h - platform/external/libtextclassifier

blob: ef9a3df2bf764a4637618b3dc3ead418c0d2d97b [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	// Feature processing for FFModel (feed-forward SmartSelection model).
				18
				19	#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_
				20	#define LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_
				21
				22	#include <memory>
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	23	#include <set>
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	24	#include <string>
				25	#include <vector>
				26
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	27	#include "smartselect/cached-features.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	28	#include "smartselect/text-classification-model.pb.h"
				29	#include "smartselect/token-feature-extractor.h"
				30	#include "smartselect/tokenizer.h"
				31	#include "smartselect/types.h"
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	32	#include "util/base/logging.h"
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	33	#include "util/utf8/unicodetext.h"
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	34
				35	namespace libtextclassifier {
				36
				37	constexpr int kInvalidLabel = -1;
				38
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	39	// Maps a vector of sparse features and a vector of dense features to a vector
				40	// of features that combines both.
				41	// The output is written to the memory location pointed to by the last float*
				42	// argument.
				43	// Returns true on success, false on failure.
				44	using FeatureVectorFn = std::function<bool(const std::vector<int>&,
				45	const std::vector<float>&, float*)>;
				46
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	47	namespace internal {
				48
				49	// Parses the serialized protocol buffer.
				50	FeatureProcessorOptions ParseSerializedOptions(
				51	const std::string& serialized_options);
				52
				53	TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
				54	const FeatureProcessorOptions& options);
				55
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	56	// Splits tokens that contain the selection boundary inside them.
				57	// E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
				58	void SplitTokensOnSelectionBoundaries(CodepointSpan selection,
				59	std::vector<Token>* tokens);
				60
Matt Sharifi	be876dc	2017-03-17 17:02:43 +0100	[diff] [blame]	61	// Returns the index of token that corresponds to the codepoint span.
				62	int CenterTokenFromClick(CodepointSpan span, const std::vector<Token>& tokens);
				63
				64	// Returns the index of token that corresponds to the middle of the codepoint
				65	// span.
				66	int CenterTokenFromMiddleOfSelection(
				67	CodepointSpan span, const std::vector<Token>& selectable_tokens);
				68
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	69	// Strips the tokens from the tokens vector that are not used for feature
				70	// extraction because they are out of scope, or pads them so that there are
				71	// enough tokens in the required context_size for all inferences with a click
				72	// in relative_click_span.
				73	void StripOrPadTokens(TokenSpan relative_click_span, int context_size,
				74	std::vector<Token>* tokens, int* click_pos);
				75
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	76	} // namespace internal
				77
Lukas Zilka	40c18de	2017-04-10 17:22:22 +0200	[diff] [blame]	78	// Converts a codepoint span to a token span in the given list of tokens.
Lukas Zilka	726b4d2	2017-12-13 16:37:03 +0100	[diff] [blame^]	79	// If snap_boundaries_to_containing_tokens is set to true, it is enough for a
				80	// token to overlap with the codepoint range to be considered part of it.
				81	// Otherwise it must be fully included in the range.
				82	TokenSpan CodepointSpanToTokenSpan(
				83	const std::vector<Token>& selectable_tokens, CodepointSpan codepoint_span,
				84	bool snap_boundaries_to_containing_tokens = false);
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	85
Lukas Zilka	40c18de	2017-04-10 17:22:22 +0200	[diff] [blame]	86	// Converts a token span to a codepoint span in the given list of tokens.
				87	CodepointSpan TokenSpanToCodepointSpan(
				88	const std::vector<Token>& selectable_tokens, TokenSpan token_span);
				89
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	90	// Takes care of preparing features for the span prediction model.
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	91	class FeatureProcessor {
				92	public:
				93	explicit FeatureProcessor(const FeatureProcessorOptions& options)
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	94	: feature_extractor_(
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	95	internal::BuildTokenFeatureExtractorOptions(options)),
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	96	options_(options),
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	97	tokenizer_({options.tokenization_codepoint_config().begin(),
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	98	options.tokenization_codepoint_config().end()}) {
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	99	MakeLabelMaps();
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	100	PrepareCodepointRanges({options.supported_codepoint_ranges().begin(),
				101	options.supported_codepoint_ranges().end()},
				102	&supported_codepoint_ranges_);
				103	PrepareCodepointRanges(
				104	{options.internal_tokenizer_codepoint_ranges().begin(),
				105	options.internal_tokenizer_codepoint_ranges().end()},
				106	&internal_tokenizer_codepoint_ranges_);
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	107	PrepareIgnoredSpanBoundaryCodepoints();
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	108	}
				109
				110	explicit FeatureProcessor(const std::string& serialized_options)
				111	: FeatureProcessor(internal::ParseSerializedOptions(serialized_options)) {
				112	}
				113
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	114	// Tokenizes the input string using the selected tokenization method.
				115	std::vector<Token> Tokenize(const std::string& utf8_text) const;
				116
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	117	// Converts a label into a token span.
				118	bool LabelToTokenSpan(int label, TokenSpan* token_span) const;
				119
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	120	// Gets the total number of selection labels.
				121	int GetSelectionLabelCount() const { return label_to_selection_.size(); }
				122
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	123	// Gets the string value for given collection label.
				124	std::string LabelToCollection(int label) const;
				125
				126	// Gets the total number of collections of the model.
				127	int NumCollections() const { return collection_to_label_.size(); }
				128
				129	// Gets the name of the default collection.
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	130	std::string GetDefaultCollection() const;
				131
				132	const FeatureProcessorOptions& GetOptions() const { return options_; }
				133
				134	// Tokenizes the context and input span, and finds the click position.
				135	void TokenizeAndFindClick(const std::string& context,
				136	CodepointSpan input_span,
				137	std::vector<Token>* tokens, int* click_pos) const;
				138
				139	// Extracts features as a CachedFeatures object that can be used for repeated
				140	// inference over token spans in the given context.
Lukas Zilka	726b4d2	2017-12-13 16:37:03 +0100	[diff] [blame^]	141	// When input_span == {kInvalidIndex, kInvalidIndex}, then relative_click_span
				142	// is ignored, and all tokens extracted from context will be considered.
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	143	bool ExtractFeatures(const std::string& context, CodepointSpan input_span,
				144	TokenSpan relative_click_span,
				145	const FeatureVectorFn& feature_vector_fn,
				146	int feature_vector_size, std::vector<Token>* tokens,
				147	int* click_pos,
				148	std::unique_ptr<CachedFeatures>* cached_features) const;
				149
				150	// Fills selection_label_spans with CodepointSpans that correspond to the
				151	// selection labels. The CodepointSpans are based on the codepoint ranges of
				152	// given tokens.
				153	bool SelectionLabelSpans(
				154	VectorSpan<Token> tokens,
				155	std::vector<CodepointSpan>* selection_label_spans) const;
				156
				157	int DenseFeaturesCount() const {
				158	return feature_extractor_.DenseFeaturesCount();
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	159	}
				160
Lukas Zilka	726b4d2	2017-12-13 16:37:03 +0100	[diff] [blame^]	161	// Splits context to several segments according to configuration.
				162	std::vector<UnicodeTextRange> SplitContext(
				163	const UnicodeText& context_unicode) const;
				164
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	165	// Strips boundary codepoints from the span in context and returns the new
				166	// start and end indices. If the span consists entirely of boundary
				167	// codepoints, the first index of span is returned for both indices.
				168	CodepointSpan StripBoundaryCodepoints(const std::string& context,
				169	CodepointSpan span) const;
				170
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	171	protected:
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	172	// Represents a codepoint range [start, end).
				173	struct CodepointRange {
				174	int32 start;
				175	int32 end;
				176
				177	CodepointRange(int32 arg_start, int32 arg_end)
				178	: start(arg_start), end(arg_end) {}
				179	};
				180
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	181	// Returns the class id corresponding to the given string collection
				182	// identifier. There is a catch-all class id that the function returns for
				183	// unknown collections.
				184	int CollectionToLabel(const std::string& collection) const;
				185
				186	// Prepares mapping from collection names to labels.
				187	void MakeLabelMaps();
				188
				189	// Gets the number of spannable tokens for the model.
				190	//
				191	// Spannable tokens are those tokens of context, which the model predicts
				192	// selection spans over (i.e., there is 1:1 correspondence between the output
				193	// classes of the model and each of the spannable tokens).
				194	int GetNumContextTokens() const { return options_.context_size() * 2 + 1; }
				195
				196	// Converts a label into a span of codepoint indices corresponding to it
				197	// given output_tokens.
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	198	bool LabelToSpan(int label, const VectorSpan<Token>& output_tokens,
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	199	CodepointSpan* span) const;
				200
				201	// Converts a span to the corresponding label given output_tokens.
				202	bool SpanToLabel(const std::pair<CodepointIndex, CodepointIndex>& span,
				203	const std::vector<Token>& output_tokens, int* label) const;
				204
				205	// Converts a token span to the corresponding label.
				206	int TokenSpanToLabel(const std::pair<TokenIndex, TokenIndex>& span) const;
				207
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	208	void PrepareCodepointRanges(
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	209	const std::vector<FeatureProcessorOptions::CodepointRange>&
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	210	codepoint_ranges,
				211	std::vector<CodepointRange>* prepared_codepoint_ranges);
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	212
				213	// Returns the ratio of supported codepoints to total number of codepoints in
				214	// the input context around given click position.
				215	float SupportedCodepointsRatio(int click_pos,
				216	const std::vector<Token>& tokens) const;
				217
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	218	// Returns true if given codepoint is covered by the given sorted vector of
				219	// codepoint ranges.
				220	bool IsCodepointInRanges(
				221	int codepoint, const std::vector<CodepointRange>& codepoint_ranges) const;
Lukas Zilka	26e8c2e	2017-04-06 15:54:24 +0200	[diff] [blame]	222
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	223	void PrepareIgnoredSpanBoundaryCodepoints();
				224
				225	// Counts the number of span boundary codepoints. If count_from_beginning is
				226	// True, the counting will start at the span_start iterator (inclusive) and at
				227	// maximum end at span_end (exclusive). If count_from_beginning is False, the
				228	// counting will start from span_end (exclusive) and end at span_start
				229	// (inclusive).
				230	int CountIgnoredSpanBoundaryCodepoints(
				231	const UnicodeText::const_iterator& span_start,
				232	const UnicodeText::const_iterator& span_end,
				233	bool count_from_beginning) const;
				234
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	235	// Finds the center token index in tokens vector, using the method defined
				236	// in options_.
				237	int FindCenterToken(CodepointSpan span,
				238	const std::vector<Token>& tokens) const;
				239
Lukas Zilka	40c18de	2017-04-10 17:22:22 +0200	[diff] [blame]	240	// Tokenizes the input text using ICU tokenizer.
				241	bool ICUTokenize(const std::string& context,
				242	std::vector<Token>* result) const;
				243
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	244	// Takes the result of ICU tokenization and retokenizes stretches of tokens
				245	// made of a specific subset of characters using the internal tokenizer.
				246	void InternalRetokenize(const std::string& context,
				247	std::vector<Token>* tokens) const;
				248
				249	// Tokenizes a substring of the unicode string, appending the resulting tokens
				250	// to the output vector. The resulting tokens have bounds relative to the full
				251	// string. Does nothing if the start of the span is negative.
				252	void TokenizeSubstring(const UnicodeText& unicode_text, CodepointSpan span,
				253	std::vector<Token>* result) const;
				254
Lukas Zilka	726b4d2	2017-12-13 16:37:03 +0100	[diff] [blame^]	255	// Removes all tokens from tokens that are not on a line (defined by calling
				256	// SplitContext on the context) to which span points.
				257	void StripTokensFromOtherLines(const std::string& context, CodepointSpan span,
				258	std::vector<Token>* tokens) const;
				259
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	260	const TokenFeatureExtractor feature_extractor_;
				261
Matt Sharifi	f95c3bd	2017-04-25 18:41:11 +0200	[diff] [blame]	262	// Codepoint ranges that define what codepoints are supported by the model.
				263	// NOTE: Must be sorted.
				264	std::vector<CodepointRange> supported_codepoint_ranges_;
				265
				266	// Codepoint ranges that define which tokens (consisting of which codepoints)
				267	// should be re-tokenized with the internal tokenizer in the mixed
				268	// tokenization mode.
				269	// NOTE: Must be sorted.
				270	std::vector<CodepointRange> internal_tokenizer_codepoint_ranges_;
				271
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	272	private:
Lukas Zilka	e5ea2ab	2017-10-11 10:50:05 +0200	[diff] [blame]	273	// Set of codepoints that will be stripped from beginning and end of
				274	// predicted spans.
				275	std::set<int32> ignored_span_boundary_codepoints_;
				276
Lukas Zilka	6bb39a8	2017-04-07 19:55:11 +0200	[diff] [blame]	277	const FeatureProcessorOptions options_;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	278
				279	// Mapping between token selection spans and labels ids.
				280	std::map<TokenSpan, int> selection_to_label_;
				281	std::vector<TokenSpan> label_to_selection_;
				282
				283	// Mapping between collections and labels.
				284	std::map<std::string, int> collection_to_label_;
				285
				286	Tokenizer tokenizer_;
Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame]	287	};
				288
				289	} // namespace libtextclassifier
				290
				291	#endif // LIBTEXTCLASSIFIER_SMARTSELECT_FEATURE_PROCESSOR_H_