blob: 65072f30a5920aa56249e7df8330ee915518289c [file] [log] [blame]
Lukas Zilka21d8c982018-01-24 11:11:20 +01001/*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "tokenizer.h"
18
19#include <vector>
20
21#include "gmock/gmock.h"
22#include "gtest/gtest.h"
23
24namespace libtextclassifier2 {
25namespace {
26
27using testing::ElementsAreArray;
28
// Test-only subclass of Tokenizer that re-exports the otherwise non-public
// FindTokenizationRange() so tests can probe range lookup directly.
class TestingTokenizer : public Tokenizer {
 public:
  explicit TestingTokenizer(
      const std::vector<const TokenizationCodepointRange*>&
          codepoint_range_configs,
      bool split_on_script_change)
      : Tokenizer(codepoint_range_configs, split_on_script_change) {}

  // Make the base-class lookup callable from the test proxy below.
  using Tokenizer::FindTokenizationRange;
};
39
40class TestingTokenizerProxy {
41 public:
42 explicit TestingTokenizerProxy(
43 const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
44 bool split_on_script_change) {
45 int num_configs = codepoint_range_configs.size();
46 std::vector<const TokenizationCodepointRange*> configs_fb;
47 buffers_.reserve(num_configs);
48 for (int i = 0; i < num_configs; i++) {
49 flatbuffers::FlatBufferBuilder builder;
50 builder.Finish(CreateTokenizationCodepointRange(
51 builder, &codepoint_range_configs[i]));
52 buffers_.push_back(builder.Release());
53 configs_fb.push_back(
54 flatbuffers::GetRoot<TokenizationCodepointRange>(buffers_[i].data()));
55 }
56 tokenizer_ = std::unique_ptr<TestingTokenizer>(
57 new TestingTokenizer(configs_fb, split_on_script_change));
58 }
59
60 TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
Lukas Zilkaba849e72018-03-08 14:48:21 +010061 const TokenizationCodepointRangeT* range =
Lukas Zilka21d8c982018-01-24 11:11:20 +010062 tokenizer_->FindTokenizationRange(c);
63 if (range != nullptr) {
Lukas Zilkaba849e72018-03-08 14:48:21 +010064 return range->role;
Lukas Zilka21d8c982018-01-24 11:11:20 +010065 } else {
66 return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
67 }
68 }
69
70 std::vector<Token> Tokenize(const std::string& utf8_text) const {
71 return tokenizer_->Tokenize(utf8_text);
72 }
73
74 private:
75 std::vector<flatbuffers::DetachedBuffer> buffers_;
76 std::unique_ptr<TestingTokenizer> tokenizer_;
77};
78
79TEST(TokenizerTest, FindTokenizationRange) {
80 std::vector<TokenizationCodepointRangeT> configs;
81 TokenizationCodepointRangeT* config;
82
83 configs.emplace_back();
84 config = &configs.back();
85 config->start = 0;
86 config->end = 10;
87 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
88
89 configs.emplace_back();
90 config = &configs.back();
91 config->start = 32;
92 config->end = 33;
93 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
94
95 configs.emplace_back();
96 config = &configs.back();
97 config->start = 1234;
98 config->end = 12345;
99 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
100
101 TestingTokenizerProxy tokenizer(configs, /*split_on_script_change=*/false);
102
103 // Test hits to the first group.
104 EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
105 TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
106 EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
107 TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
108 EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
109 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
110
111 // Test a hit to the second group.
112 EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
113 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
114 EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
115 TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
116 EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
117 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
118
119 // Test hits to the third group.
120 EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
121 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
122 EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
123 TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
124 EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
125 TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
126 EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
127 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
128
129 // Test a hit outside.
130 EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
131 TokenizationCodepointRange_::Role_DEFAULT_ROLE);
132}
133
134TEST(TokenizerTest, TokenizeOnSpace) {
135 std::vector<TokenizationCodepointRangeT> configs;
136 TokenizationCodepointRangeT* config;
137
138 configs.emplace_back();
139 config = &configs.back();
140 // Space character.
141 config->start = 32;
142 config->end = 33;
143 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
144
145 TestingTokenizerProxy tokenizer(configs, /*split_on_script_change=*/false);
146 std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");
147
148 EXPECT_THAT(tokens,
149 ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
150}
151
152TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
153 std::vector<TokenizationCodepointRangeT> configs;
154 TokenizationCodepointRangeT* config;
155
156 // Latin.
157 configs.emplace_back();
158 config = &configs.back();
159 config->start = 0;
160 config->end = 32;
161 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
162 config->script_id = 1;
163 configs.emplace_back();
164 config = &configs.back();
165 config->start = 32;
166 config->end = 33;
167 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
168 config->script_id = 1;
169 configs.emplace_back();
170 config = &configs.back();
171 config->start = 33;
172 config->end = 0x77F + 1;
173 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
174 config->script_id = 1;
175
176 TestingTokenizerProxy tokenizer(configs, /*split_on_script_change=*/true);
177 EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
178 std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
179 Token("전화", 7, 10), Token("(123)", 10, 15),
180 Token("456-789", 16, 23),
181 Token("웹사이트", 23, 28)}));
182} // namespace
183
184TEST(TokenizerTest, TokenizeComplex) {
185 std::vector<TokenizationCodepointRangeT> configs;
186 TokenizationCodepointRangeT* config;
187
188 // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
189 // Latin - cyrilic.
190 // 0000..007F; Basic Latin
191 // 0080..00FF; Latin-1 Supplement
192 // 0100..017F; Latin Extended-A
193 // 0180..024F; Latin Extended-B
194 // 0250..02AF; IPA Extensions
195 // 02B0..02FF; Spacing Modifier Letters
196 // 0300..036F; Combining Diacritical Marks
197 // 0370..03FF; Greek and Coptic
198 // 0400..04FF; Cyrillic
199 // 0500..052F; Cyrillic Supplement
200 // 0530..058F; Armenian
201 // 0590..05FF; Hebrew
202 // 0600..06FF; Arabic
203 // 0700..074F; Syriac
204 // 0750..077F; Arabic Supplement
205 configs.emplace_back();
206 config = &configs.back();
207 config->start = 0;
208 config->end = 32;
209 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
210 configs.emplace_back();
211 config = &configs.back();
212 config->start = 32;
213 config->end = 33;
214 config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
215 configs.emplace_back();
216 config = &configs.back();
217 config->start = 33;
218 config->end = 0x77F + 1;
219 config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
220
221 // CJK
222 // 2E80..2EFF; CJK Radicals Supplement
223 // 3000..303F; CJK Symbols and Punctuation
224 // 3040..309F; Hiragana
225 // 30A0..30FF; Katakana
226 // 3100..312F; Bopomofo
227 // 3130..318F; Hangul Compatibility Jamo
228 // 3190..319F; Kanbun
229 // 31A0..31BF; Bopomofo Extended
230 // 31C0..31EF; CJK Strokes
231 // 31F0..31FF; Katakana Phonetic Extensions
232 // 3200..32FF; Enclosed CJK Letters and Months
233 // 3300..33FF; CJK Compatibility
234 // 3400..4DBF; CJK Unified Ideographs Extension A
235 // 4DC0..4DFF; Yijing Hexagram Symbols
236 // 4E00..9FFF; CJK Unified Ideographs
237 // A000..A48F; Yi Syllables
238 // A490..A4CF; Yi Radicals
239 // A4D0..A4FF; Lisu
240 // A500..A63F; Vai
241 // F900..FAFF; CJK Compatibility Ideographs
242 // FE30..FE4F; CJK Compatibility Forms
243 // 20000..2A6DF; CJK Unified Ideographs Extension B
244 // 2A700..2B73F; CJK Unified Ideographs Extension C
245 // 2B740..2B81F; CJK Unified Ideographs Extension D
246 // 2B820..2CEAF; CJK Unified Ideographs Extension E
247 // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
248 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
249 configs.emplace_back();
250 config = &configs.back();
251 config->start = 0x2E80;
252 config->end = 0x2EFF + 1;
253 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
254 configs.emplace_back();
255 config = &configs.back();
256 config->start = 0x3000;
257 config->end = 0xA63F + 1;
258 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
259 configs.emplace_back();
260 config = &configs.back();
261 config->start = 0xF900;
262 config->end = 0xFAFF + 1;
263 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
264 configs.emplace_back();
265 config = &configs.back();
266 config->start = 0xFE30;
267 config->end = 0xFE4F + 1;
268 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
269 configs.emplace_back();
270 config = &configs.back();
271 config->start = 0x20000;
272 config->end = 0x2A6DF + 1;
273 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
274 configs.emplace_back();
275 config = &configs.back();
276 config->start = 0x2A700;
277 config->end = 0x2B73F + 1;
278 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
279 configs.emplace_back();
280 config = &configs.back();
281 config->start = 0x2B740;
282 config->end = 0x2B81F + 1;
283 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
284 configs.emplace_back();
285 config = &configs.back();
286 config->start = 0x2B820;
287 config->end = 0x2CEAF + 1;
288 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
289 configs.emplace_back();
290 config = &configs.back();
291 config->start = 0x2CEB0;
292 config->end = 0x2EBEF + 1;
293 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
294 configs.emplace_back();
295 config = &configs.back();
296 config->start = 0x2F800;
297 config->end = 0x2FA1F + 1;
298 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
299
300 // Thai.
301 // 0E00..0E7F; Thai
302 configs.emplace_back();
303 config = &configs.back();
304 config->start = 0x0E00;
305 config->end = 0x0E7F + 1;
306 config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
307
308 TestingTokenizerProxy tokenizer(configs, /*split_on_script_change=*/false);
309 std::vector<Token> tokens;
310
311 tokens = tokenizer.Tokenize(
312 "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
313 EXPECT_EQ(tokens.size(), 30);
314
315 tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
316 // clang-format off
317 EXPECT_THAT(
318 tokens,
319 ElementsAreArray({Token("問", 0, 1),
320 Token("少", 1, 2),
321 Token("目", 2, 3),
322 Token("hello", 4, 9),
323 Token("木", 10, 11),
324 Token("輸", 11, 12),
325 Token("ย", 12, 13),
326 Token("า", 13, 14),
327 Token("ม", 14, 15),
328 Token("き", 15, 16),
329 Token("ゃ", 16, 17)}));
330 // clang-format on
331}
332
333} // namespace
334} // namespace libtextclassifier2