blob: 9ed152ff661888d4989314e7006e8b8210a4fa61 [file] [log] [blame]
Matt Sharifibda09f12017-03-10 12:29:15 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17// Tokenizer.
18
19#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
20#define LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_
21
22#include <string>
23#include <vector>
24
25#include "smartselect/tokenizer.pb.h"
26#include "smartselect/types.h"
27#include "util/base/integral_types.h"
28
29namespace libtextclassifier {
30
31// Represents a codepoint range [start, end) with its role for tokenization.
32struct CodepointRange {
33 int32 start;
34 int32 end;
35 TokenizationCodepointRange::Role role;
36
37 CodepointRange(int32 arg_start, int32 arg_end,
38 TokenizationCodepointRange::Role arg_role)
39 : start(arg_start), end(arg_end), role(arg_role) {}
40};
41
42// Tokenizer splits the input string into a sequence of tokens, according to the
43// configuration.
44class Tokenizer {
45 public:
46 explicit Tokenizer(
47 const std::vector<TokenizationCodepointRange>& codepoint_range_configs) {
48 PrepareTokenizationCodepointRanges(codepoint_range_configs);
49 }
50
51 // Tokenizes the input string using the selected tokenization method.
52 std::vector<Token> Tokenize(const std::string& utf8_text) const;
53
54 protected:
55 // Prepares tokenization codepoint ranges for use in tokenization.
56 void PrepareTokenizationCodepointRanges(
57 const std::vector<TokenizationCodepointRange> codepoint_range_configs);
58
59 // Finds the tokenization role for given codepoint.
60 // If the character is not found returns DEFAULT_ROLE.
61 // Internally uses binary search so should be O(log2(# of codepoint_ranges)).
62 TokenizationCodepointRange::Role FindTokenizationRole(int codepoint) const;
63
64 private:
65 // Codepoint ranges that determine how different codepoints are tokenized.
66 // The ranges must not overlap.
67 std::vector<CodepointRange> codepoint_ranges_;
68};
69
70} // namespace libtextclassifier
71
72#endif // LIBTEXTCLASSIFIER_SMARTSELECT_TOKENIZER_H_