Blame - tokenizer.h - platform/external/libtextclassifier

blob: 72a9fbde4cdac5e221e35ae0751e8bd8258315db [file] [log] [blame]

Lukas Zilka	21d8c98	2018-01-24 11:11:20 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2017 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef KNOWLEDGE_CEREBRA_SENSE_TEXT_CLASSIFIER_LIB2_TOKENIZER_H_
				18	#define KNOWLEDGE_CEREBRA_SENSE_TEXT_CLASSIFIER_LIB2_TOKENIZER_H_
				19
				20	#include <string>
				21	#include <vector>
				22
				23	#include "model_generated.h"
				24	#include "types.h"
				25	#include "util/base/integral_types.h"
				26
				27	namespace libtextclassifier2 {
				28
				29	const int kInvalidScript = -1;  // NOTE(review): where this sentinel is used is not visible in this header - confirm in the implementation.
				30	const int kUnknownScript = -2;  // Script value that Tokenizer::GetScriptAndRole assigns when the codepoint is not found.
				31
				32	// Tokenizer splits the input string into a sequence of tokens, according to the
				33	// configuration.
					// The configuration consists of the two constructor arguments: a list of
					// codepoint ranges and a flag that controls splitting on script changes.
				34	class Tokenizer {
				35	public:
					// Creates a tokenizer from the given codepoint ranges and script-split flag.
					// NOTE(review): the ranges are passed as raw pointers; presumably the
					// pointed-to objects must outlive the Tokenizer - confirm against the
					// implementation.
				36	explicit Tokenizer(
				37	const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
				38	bool split_on_script_change);
				39
				40	// Tokenizes the input string using the selected tokenization method.
					// Returns the resulting tokens by value.
					// NOTE(review): the parameter name suggests the input is expected to be
					// UTF-8 encoded - confirm against the implementation.
				41	std::vector<Token> Tokenize(const std::string& utf8_text) const;
				42
				43	protected:
				44	// Finds the tokenization codepoint range config for the given codepoint.
				45	// Internally uses binary search so should be O(log(# of codepoint_ranges)).
					// NOTE(review): the return value for a codepoint that lies in no range is
					// not stated here (presumably nullptr) - confirm.
					// NOTE(review): the codepoint is an int here but a char32 in
					// GetScriptAndRole.
				46	const TokenizationCodepointRange* FindTokenizationRange(int codepoint) const;
				47
				48	// Finds the role and script for the given codepoint. If not found, DEFAULT_ROLE
				49	// and kUnknownScript are assigned.
					// The results are delivered through the role and script output pointers.
					// NOTE(review): presumably both pointers must be non-null - confirm.
				50	void GetScriptAndRole(char32 codepoint,
				51	TokenizationCodepointRange_::Role* role,
				52	int* script) const;
				53
				54	private:
				55	// Codepoint ranges that determine how different codepoints are tokenized.
				56	// The ranges must not overlap.
				57	std::vector<const TokenizationCodepointRange*> codepoint_ranges_;
				58
				59	// If true, tokens will be additionally split when the codepoint's script_id
				60	// changes.
				61	bool split_on_script_change_;
				62	};
				63
				64	} // namespace libtextclassifier2
				65
				66	#endif // KNOWLEDGE_CEREBRA_SENSE_TEXT_CLASSIFIER_LIB2_TOKENIZER_H_