Blame - smartselect/tokenizer.h - platform/external/libtextclassifier

blob: 9ed152ff661888d4989314e7006e8b8210a4fa61 [file] [log] [blame]

Matt Sharifi	bda09f1	2017-03-10 12:29:15 +0100	[diff] [blame^]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	// Tokenizer.
				18
				19	#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
				20	#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
				21
				22	#include <string>
				23	#include <vector>
				24
				25	#include "smartselect/tokenizer.pb.h"
				26	#include "smartselect/types.h"
				27	#include "util/base/integral_types.h"
				28
				29	namespace libtextclassifier {
				30
				31	// Represents a codepoint range [start, end) with its role for tokenization.
				32	struct CodepointRange {
				33	int32 start;
				34	int32 end;
				35	TokenizationCodepointRange::Role role;
				36
				37	CodepointRange(int32 arg_start, int32 arg_end,
				38	TokenizationCodepointRange::Role arg_role)
				39	: start(arg_start), end(arg_end), role(arg_role) {}
				40	};
				41
				42	// Tokenizer splits the input string into a sequence of tokens, according to the
				43	// configuration.
				44	class Tokenizer {
				45	public:
				46	explicit Tokenizer(
				47	const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
				48	PrepareTokenizationCodepointRanges(codepoint_range_configs);
				49	}
				50
				51	// Tokenizes the input string using the selected tokenization method.
				52	std::vector<Token> Tokenize(const std::string& utf8_text) const;
				53
				54	protected:
				55	// Prepares tokenization codepoint ranges for use in tokenization.
				56	void PrepareTokenizationCodepointRanges(
				57	const std::vector<TokenizationCodepointRange> codepoint_range_configs);
				58
				59	// Finds the tokenization role for given codepoint.
				60	// If the character is not found returns DEFAULT_ROLE.
				61	// Internally uses binary search so should be O(log2(# of codepoint_ranges)).
				62	TokenizationCodepointRange::Role FindTokenizationRole(int codepoint) const;
				63
				64	private:
				65	// Codepoint ranges that determine how different codepoints are tokenized.
				66	// The ranges must not overlap.
				67	std::vector<CodepointRange> codepoint_ranges_;
				68	};
				69
				70	} // namespace libtextclassifier
				71
				72	#endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_