/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
18#define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
19
20#include <string>
21#include <unordered_set>
22#include <vector>
23
Tony Mak378c1f52019-03-04 15:58:11 +000024#include "annotator/model_generated.h"
25#include "annotator/types.h"
Tony Maka2a1ff42019-09-12 15:40:32 +010026#include "utils/base/logging.h"
Tony Mak968412a2019-11-13 15:39:57 +000027#include "utils/container/sorted-strings-table.h"
Tony Mak63959242020-02-07 18:31:16 +000028#include "utils/tokenizer.h"
Tony Mak378c1f52019-03-04 15:58:11 +000029#include "utils/utf8/unicodetext.h"
30
31namespace libtextclassifier3 {
32
// Annotator of numbers in text.
//
// Supported integer values are in the range [-1 000 000 000, 1 000 000 000].
// Supported double values are in the range [-999999999.999999999,
// 999999999.999999999].
Tony Mak378c1f52019-03-04 15:58:11 +000038class NumberAnnotator {
39 public:
40 explicit NumberAnnotator(const NumberAnnotatorOptions* options,
Tony Mak63959242020-02-07 18:31:16 +000041 const UniLib* unilib)
Tony Mak378c1f52019-03-04 15:58:11 +000042 : options_(options),
Tony Mak63959242020-02-07 18:31:16 +000043 unilib_(unilib),
44 tokenizer_(Tokenizer(TokenizationType_LETTER_DIGIT, unilib,
45 /*codepoint_ranges=*/{},
46 /*internal_tokenizer_codepoint_ranges=*/{},
47 /*split_on_script_change=*/false,
48 /*icu_preserve_whitespace_tokens=*/true)),
Tony Makd0ae7c62020-03-27 13:58:00 +000049 percent_suffixes_(FromFlatbufferStringToUnordredSet(
50 options_->percentage_pieces_string())),
Tony Mak63959242020-02-07 18:31:16 +000051 max_number_of_digits_(options->max_number_of_digits()) {}
Tony Mak378c1f52019-03-04 15:58:11 +000052
53 // Classifies given text, and if it is a number, it passes the result in
54 // 'classification_result' and returns true, otherwise returns false.
55 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
56 AnnotationUsecase annotation_usecase,
57 ClassificationResult* classification_result) const;
58
Tony Mak63959242020-02-07 18:31:16 +000059 // Finds all number instances in the input text. Returns true in any case.
Tony Mak378c1f52019-03-04 15:58:11 +000060 bool FindAll(const UnicodeText& context_unicode,
61 AnnotationUsecase annotation_usecase,
62 std::vector<AnnotatedSpan>* result) const;
63
64 private:
Tony Makd0ae7c62020-03-27 13:58:00 +000065 // Converts a Flatbuffer string containing zero-separated percent suffixes
66 // to an unordered set.
67 static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet(
68 const flatbuffers::String* flatbuffer_percent_strings);
Tony Maka2a1ff42019-09-12 15:40:32 +010069
70 // Checks if the annotated numbers from the context represent percentages.
71 // If yes, replaces the collection type and the annotation boundary in the
72 // result.
73 void FindPercentages(const UnicodeText& context,
74 std::vector<AnnotatedSpan>* result) const;
75
Tony Mak63959242020-02-07 18:31:16 +000076 // Checks if the tokens from in the interval [start_index-2, start_index] are
77 // valid characters that can preced a number context.
78 bool TokensAreValidStart(const std::vector<Token>& tokens,
Tony Makd0ae7c62020-03-27 13:58:00 +000079 int start_index) const;
Tony Mak63959242020-02-07 18:31:16 +000080
81 // Checks if the tokens in the interval (..., prefix_end_index] are a valid
82 // number prefix.
83 bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens,
Tony Makd0ae7c62020-03-27 13:58:00 +000084 int prefix_end_index) const;
Tony Mak63959242020-02-07 18:31:16 +000085
86 // Checks if the tokens from in the interval [ending_index, ending_index+2]
87 // are valid characters that can follow a number context.
88 bool TokensAreValidEnding(const std::vector<Token>& tokens,
Tony Makd0ae7c62020-03-27 13:58:00 +000089 int ending_index) const;
Tony Mak63959242020-02-07 18:31:16 +000090
91 // Checks if the tokens in the interval [suffix_start_index, ...) are a valid
92 // number suffix.
93 bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens,
Tony Makd0ae7c62020-03-27 13:58:00 +000094 int suffix_start_index) const;
95
96 // Checks if the tokens in the interval [suffix_start_index, ...) are a valid
97 // percent suffix. If false, returns -1, else returns the end codepoint.
98 int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens,
99 int suffix_token_start_index) const;
Tony Mak63959242020-02-07 18:31:16 +0000100
101 // Checks if the given text represents a number (either int or double).
Tony Makd0ae7c62020-03-27 13:58:00 +0000102 bool TryParseNumber(const UnicodeText& token_text, bool is_negative,
Tony Mak63959242020-02-07 18:31:16 +0000103 int64* parsed_int_value,
104 double* parsed_double_value) const;
105
106 // Checks if a word contains only CJT characters.
107 bool IsCJTterm(UnicodeText::const_iterator token_begin_it,
Tony Makd0ae7c62020-03-27 13:58:00 +0000108 int token_length) const;
109
110 AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value,
111 double double_value,
112 const std::string collection, float score,
113 float priority_score) const;
Tony Mak63959242020-02-07 18:31:16 +0000114
Tony Mak378c1f52019-03-04 15:58:11 +0000115 const NumberAnnotatorOptions* options_;
Tony Mak63959242020-02-07 18:31:16 +0000116 const UniLib* unilib_;
117 const Tokenizer tokenizer_;
Tony Makd0ae7c62020-03-27 13:58:00 +0000118 const std::unordered_set<std::string> percent_suffixes_;
Tony Mak63959242020-02-07 18:31:16 +0000119 const int max_number_of_digits_;
Tony Mak378c1f52019-03-04 15:58:11 +0000120};
121
122} // namespace libtextclassifier3
123
124#endif // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_