Blame - native/annotator/number/number.h - platform/external/libtextclassifier

blob: d83bea01ed4ca4964c1d6dd6c9005689bdf89875 [file] [log] [blame]

Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2018 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
				18	#define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
				19
				20	#include <string>
				21	#include <unordered_set>
				22	#include <vector>
				23
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	24	#include "annotator/model_generated.h"
				25	#include "annotator/types.h"
Tony Mak	a2a1ff4	2019-09-12 15:40:32 +0100	[diff] [blame]	26	#include "utils/base/logging.h"
Tony Mak	968412a	2019-11-13 15:39:57 +0000	[diff] [blame]	27	#include "utils/container/sorted-strings-table.h"
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	28	#include "utils/tokenizer.h"
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	29	#include "utils/utf8/unicodetext.h"
				30
				31	namespace libtextclassifier3 {
				32
				33	// Annotator of numbers in text.
				34	//
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	35	// Supported integer values are in the range [-1 000 000 000, 1 000 000 000].
				36	// Supported double values are in the range [-999999999.999999999,
				37	// 999999999.999999999].
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	38	class NumberAnnotator {
				39	public:
					// Stores the given options and UniLib pointers, constructs a
					// TokenizationType_LETTER_DIGIT tokenizer, and reads the percent suffixes
					// and the maximum number of digits from the options. 'options' is
					// dereferenced here, so it must not be null.
				40	explicit NumberAnnotator(const NumberAnnotatorOptions* options,
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	41	const UniLib* unilib)
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	42	: options_(options),
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	43	unilib_(unilib),
				44	tokenizer_(Tokenizer(TokenizationType_LETTER_DIGIT, unilib,
				45	/codepoint_ranges=/{},
				46	/internal_tokenizer_codepoint_ranges=/{},
				47	/split_on_script_change=/false,
				48	/icu_preserve_whitespace_tokens=/true)),
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	49	percent_suffixes_(FromFlatbufferStringToUnordredSet(
				50	options_->percentage_pieces_string())),
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	51	max_number_of_digits_(options->max_number_of_digits()) {}
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	52
				53	// Classifies given text, and if it is a number, it passes the result in
				54	// 'classification_result' and returns true, otherwise returns false.
				55	bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
				56	AnnotationUsecase annotation_usecase,
				57	ClassificationResult* classification_result) const;
				58
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	59	// Finds all number instances in the input text. Returns true in any case.
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	60	bool FindAll(const UnicodeText& context_unicode,
				61	AnnotationUsecase annotation_usecase,
				62	std::vector<AnnotatedSpan>* result) const;
				63
				64	private:
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	65	// Converts a Flatbuffer string containing zero-separated percent suffixes
				66	// to an unordered set.
				67	static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet(
				68	const flatbuffers::String* flatbuffer_percent_strings);
Tony Mak	a2a1ff4	2019-09-12 15:40:32 +0100	[diff] [blame]	69
				70	// Checks if the annotated numbers from the context represent percentages.
				71	// If yes, replaces the collection type and the annotation boundary in the
				72	// result.
				73	void FindPercentages(const UnicodeText& context,
				74	std::vector<AnnotatedSpan>* result) const;
				75
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	76	// Checks if the tokens in the interval [start_index-2, start_index] are
				77	// valid characters that can precede a number context.
				78	bool TokensAreValidStart(const std::vector<Token>& tokens,
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	79	int start_index) const;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	80
				81	// Checks if the tokens in the interval (..., prefix_end_index] are a valid
				82	// number prefix.
				83	bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens,
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	84	int prefix_end_index) const;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	85
				86	// Checks if the tokens in the interval [ending_index, ending_index+2]
				87	// are valid characters that can follow a number context.
				88	bool TokensAreValidEnding(const std::vector<Token>& tokens,
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	89	int ending_index) const;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	90
				91	// Checks if the tokens in the interval [suffix_start_index, ...) are a valid
				92	// number suffix.
				93	bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens,
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	94	int suffix_start_index) const;
				95
				96	// Checks if the tokens in the interval [suffix_token_start_index, ...) are a
				97	// valid percent suffix. Returns -1 if they are not, else the end codepoint.
				98	int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens,
				99	int suffix_token_start_index) const;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	100
				101	// Checks if the given text represents a number (either int or double).
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	102	bool TryParseNumber(const UnicodeText& token_text, bool is_negative,
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	103	int64* parsed_int_value,
				104	double* parsed_double_value) const;
				105
				106	// Checks if a word contains only CJT characters.
				107	bool IsCJTterm(UnicodeText::const_iterator token_begin_it,
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	108	int token_length) const;
				109
					// Returns an AnnotatedSpan for the given arguments; presumably the span is
					// populated from them, but the definition is not in this header.
					// NOTE(review): 'int_value' is an int while TryParseNumber outputs an
					// int64, and 'collection' is taken by value -- confirm both are intended.
				110	AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value,
				111	double double_value,
				112	const std::string collection, float score,
				113	float priority_score) const;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	114
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	115	const NumberAnnotatorOptions* options_;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	116	const UniLib* unilib_;
				117	const Tokenizer tokenizer_;
Tony Mak	d0ae7c6	2020-03-27 13:58:00 +0000	[diff] [blame]	118	const std::unordered_set<std::string> percent_suffixes_;
Tony Mak	6395924	2020-02-07 18:31:16 +0000	[diff] [blame]	119	const int max_number_of_digits_;
Tony Mak	378c1f5	2019-03-04 15:58:11 +0000	[diff] [blame]	120	};
				121
				122	} // namespace libtextclassifier3
				123
				124	#endif // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_