/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include "utf.h"
18
19#include "common_runtime_test.h"
20#include "utf-inl.h"
21
Vladimir Markoe3bbc3f2015-11-25 11:10:20 +000022#include <map>
Narayan Kamathe16dad12015-02-13 11:49:22 +000023#include <vector>
24
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000025namespace art {
26
27class UtfTest : public CommonRuntimeTest {};
28
29TEST_F(UtfTest, GetLeadingUtf16Char) {
30 EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
31}
32
33TEST_F(UtfTest, GetTrailingUtf16Char) {
34 EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
35 EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
36}
37
// Expects that |end| lies exactly |expected| bytes past |start|.
//
// The expansion deliberately does not end in a semicolon: the caller supplies
// it, so the macro behaves like a single statement (no stray empty statement,
// and it can be used as the body of an unbraced if/else).
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
41
// A test string containing one, two, three and four byte UTF-8 sequences,
// followed by a null terminator.
static const uint8_t kAllSequences[] = {
    0x24,                    // One byte sequence.
    0xc2, 0xa2,              // Two byte sequence.
    0xe2, 0x82, 0xac,        // Three byte sequence.
    0xf0, 0x9f, 0x8f, 0xa0,  // Four byte sequence.
    0x00                     // Null terminator.
};
50
// A test string that contains a UTF-8 encoding of a surrogate pair
// (code point = U+10400): each surrogate is encoded separately as a three
// byte sequence, followed by a null terminator.
static const uint8_t kSurrogateEncoding[] = {
    0xed, 0xa0, 0x81,  // 0xd801
    0xed, 0xb0, 0x80,  // 0xdc00
    0x00               // Null terminator.
};
58
59TEST_F(UtfTest, GetUtf16FromUtf8) {
60 const char* const start = reinterpret_cast<const char*>(kAllSequences);
61 const char* ptr = start;
62 uint32_t pair = 0;
63
64 // Single byte sequence.
65 pair = GetUtf16FromUtf8(&ptr);
66 EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
67 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
68 EXPECT_ARRAY_POSITION(1, ptr, start);
69
Bruce Hoult1646d7a2015-10-28 15:06:12 +030070 // Two byte sequence.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000071 pair = GetUtf16FromUtf8(&ptr);
72 EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
73 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
74 EXPECT_ARRAY_POSITION(3, ptr, start);
75
Bruce Hoult1646d7a2015-10-28 15:06:12 +030076 // Three byte sequence.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000077 pair = GetUtf16FromUtf8(&ptr);
78 EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
79 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
80 EXPECT_ARRAY_POSITION(6, ptr, start);
81
82 // Four byte sequence
83 pair = GetUtf16FromUtf8(&ptr);
84 EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
85 EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
86 EXPECT_ARRAY_POSITION(10, ptr, start);
87
Bruce Hoult1646d7a2015-10-28 15:06:12 +030088 // Null terminator.
Narayan Kamatha5afcfc2015-01-29 20:06:46 +000089 pair = GetUtf16FromUtf8(&ptr);
90 EXPECT_EQ(0, GetLeadingUtf16Char(pair));
91 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
92 EXPECT_ARRAY_POSITION(11, ptr, start);
93}
94
95TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
96 const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
97 const char* ptr = start;
98 uint32_t pair = 0;
99
100 pair = GetUtf16FromUtf8(&ptr);
101 EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
102 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
103 EXPECT_ARRAY_POSITION(3, ptr, start);
104
105 pair = GetUtf16FromUtf8(&ptr);
106 EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
107 EXPECT_EQ(0, GetTrailingUtf16Char(pair));
108 EXPECT_ARRAY_POSITION(6, ptr, start);
109}
110
111TEST_F(UtfTest, CountModifiedUtf8Chars) {
112 EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
113 EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
114}
115
Narayan Kamathe16dad12015-02-13 11:49:22 +0000116static void AssertConversion(const std::vector<uint16_t> input,
117 const std::vector<uint8_t> expected) {
118 ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
119
120 std::vector<uint8_t> output(expected.size());
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300121 ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
122 &input[0], input.size());
Narayan Kamathe16dad12015-02-13 11:49:22 +0000123 EXPECT_EQ(expected, output);
124}
125
126TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
127 // Surrogate pairs will be converted into 4 byte sequences.
128 AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
129
130 // Three byte encodings that are below & above the leading surrogate
131 // range respectively.
132 AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
133 AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
134 // Two byte encoding.
135 AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
136
137 // Two byte special case : 0 must use an overlong encoding.
138 AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
139
140 // One byte encoding.
141 AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
142
143 AssertConversion({
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300144 0xd802, 0xdc02, // Surrogate pair.
145 0xdef0, 0xdcff, // Three byte encodings.
146 0x0101, 0x0000, // Two byte encodings.
147 'p' , 'p' // One byte encoding.
Narayan Kamathe16dad12015-02-13 11:49:22 +0000148 }, {
149 0xf0, 0x90, 0xa0, 0x82,
150 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
151 0xc4, 0x81, 0xc0, 0x80,
152 0x70, 0x70
153 });
154}
155
156TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
157 // Unpaired trailing surrogate at the end of input.
158 AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
159 // Unpaired (or incorrectly paired) surrogates in the middle of the input.
Vladimir Markoe3bbc3f2015-11-25 11:10:20 +0000160 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> prefixes {
161 {{ 'h' }, { 'h' }},
162 {{ 0 }, { 0xc0, 0x80 }},
163 {{ 0x81 }, { 0xc2, 0x81 }},
164 {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
165 };
166 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> suffixes {
167 {{ 'e' }, { 'e' }},
168 {{ 0 }, { 0xc0, 0x80 }},
169 {{ 0x7ff }, { 0xdf, 0xbf }},
170 {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
171 };
172 const std::map<std::vector<uint16_t>, std::vector<uint8_t>> tests {
173 {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
174 {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
175 {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
176 {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
177 };
178 for (const auto& prefix : prefixes) {
179 const std::vector<uint16_t>& prefix_in = prefix.first;
180 const std::vector<uint8_t>& prefix_out = prefix.second;
181 for (const auto& test : tests) {
182 const std::vector<uint16_t>& test_in = test.first;
183 const std::vector<uint8_t>& test_out = test.second;
184 for (const auto& suffix : suffixes) {
185 const std::vector<uint16_t>& suffix_in = suffix.first;
186 const std::vector<uint8_t>& suffix_out = suffix.second;
187 std::vector<uint16_t> in = prefix_in;
188 in.insert(in.end(), test_in.begin(), test_in.end());
189 in.insert(in.end(), suffix_in.begin(), suffix_in.end());
190 std::vector<uint8_t> out = prefix_out;
191 out.insert(out.end(), test_out.begin(), test_out.end());
192 out.insert(out.end(), suffix_out.begin(), suffix_out.end());
193 AssertConversion(in, out);
194 }
195 }
196 }
Narayan Kamathe16dad12015-02-13 11:49:22 +0000197}
198
// Old versions of functions, here to compare answers with optimized versions.

// Returns the number of UTF-16 code units needed to represent the
// null-terminated modified UTF-8 string |utf8|. One, two and three byte
// sequences count as one unit; a four byte sequence counts as two (a
// surrogate pair).
//
// Only the lead byte of each sequence is inspected: continuation bytes are
// skipped without being read, so the input must be well formed. A sequence
// truncated by the terminator would make the scan run past the end.
size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t len = 0;
  int ic;
  while ((ic = *utf8++) != '\0') {
    len++;
    if ((ic & 0x80) == 0) {
      // one-byte encoding
      continue;
    }
    // two- or three-byte encoding
    utf8++;
    if ((ic & 0x20) == 0) {
      // two-byte encoding
      continue;
    }
    utf8++;
    if ((ic & 0x10) == 0) {
      // three-byte encoding
      continue;
    }

    // four-byte encoding: needs to be converted into a surrogate
    // pair.
    utf8++;
    len++;
  }
  return len;
}
229
// Returns the number of modified UTF-8 bytes needed to encode the
// |char_count| UTF-16 code units starting at |chars|:
//   - 0x0001 - 0x007f             -> 1 byte
//   - 0x0000 and 0x0080 - 0x07ff  -> 2 bytes (zero is never a single byte)
//   - everything else             -> 3 bytes, except that a leading surrogate
//     immediately followed by a trailing surrogate is counted as one 4 byte
//     sequence for the pair.
static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
  size_t result = 0;
  while (char_count--) {
    const uint16_t ch = *chars++;
    if (ch > 0 && ch <= 0x7f) {
      ++result;
    } else if (ch >= 0xd800 && ch <= 0xdbff) {
      if (char_count > 0) {
        const uint16_t ch2 = *chars;
        // If we find a properly paired surrogate, we emit it as a 4 byte
        // UTF sequence. If we find an unpaired leading or trailing surrogate,
        // we emit it as a 3 byte sequence, like any other code unit above
        // 0x7ff.
        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
          // Consume the second half of the pair as well.
          chars++;
          char_count--;

          result += 4;
        } else {
          result += 3;
        }
      } else {
        // This implies we found an unpaired leading surrogate at the end
        // of a string.
        result += 3;
      }
    } else if (ch > 0x7ff) {
      result += 3;
    } else {
      result += 2;
    }
  }
  return result;
}
263
// Encodes the |char_count| UTF-16 code units at |utf16_in| as modified UTF-8
// into |utf8_out|. No terminator is written and no bounds are checked: the
// caller must provide a buffer large enough for the encoded bytes.
//
// Zero is written as the two byte sequence 0xc0 0x80. A leading surrogate
// immediately followed by a trailing surrogate becomes a single four byte
// sequence; any other surrogate is written as a three byte sequence.
static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
                                                 size_t char_count) {
  while (char_count--) {
    const uint16_t ch = *utf16_in++;
    if (ch > 0 && ch <= 0x7f) {
      *utf8_out++ = static_cast<char>(ch);
    } else {
      // Char_count == 0 here implies we've encountered an unpaired
      // surrogate and we have no choice but to encode it as 3-byte UTF
      // sequence. Note that unpaired surrogates can occur as a part of
      // "normal" operation.
      if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
        const uint16_t ch2 = *utf16_in;

        // Check if the other half of the pair is within the expected
        // range. If it isn't, we will have to emit both "halves" as
        // separate 3 byte sequences.
        if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
          utf16_in++;
          char_count--;
          // Combine both halves into the code point. The constant is
          // (0xd800 << 10) + 0xdc00 - 0x10000 folded into one subtraction.
          const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
          *utf8_out++ = static_cast<char>((code_point >> 18) | 0xf0);
          *utf8_out++ = static_cast<char>(((code_point >> 12) & 0x3f) | 0x80);
          *utf8_out++ = static_cast<char>(((code_point >> 6) & 0x3f) | 0x80);
          *utf8_out++ = static_cast<char>((code_point & 0x3f) | 0x80);
          continue;
        }
      }

      if (ch > 0x07ff) {
        // Three byte encoding.
        *utf8_out++ = static_cast<char>((ch >> 12) | 0xe0);
        *utf8_out++ = static_cast<char>(((ch >> 6) & 0x3f) | 0x80);
        *utf8_out++ = static_cast<char>((ch & 0x3f) | 0x80);
      } else /*(ch > 0x7f || ch == 0)*/ {
        // Two byte encoding.
        *utf8_out++ = static_cast<char>((ch >> 6) | 0xc0);
        *utf8_out++ = static_cast<char>((ch & 0x3f) | 0x80);
      }
    }
  }
}
306
// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.

// Splits |code_point| into a UTF-16 surrogate pair: |first| receives the
// leading surrogate and |second| the trailing one. 0xd7c0 is 0xd800 minus
// (0x10000 >> 10), which folds the 0x10000 offset into a single addition.
// Callers in this file only pass code points above 0xffff.
static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
  first = static_cast<uint16_t>((code_point >> 10) + 0xd7c0);
  second = static_cast<uint16_t>((code_point & 0x03ff) + 0xdc00);
}
313
314static void testConversions(uint16_t *buf, int char_count) {
Andreas Gampe4464a3e2016-03-03 20:15:47 -0800315 char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
316 uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300317 int byte_count_test, byte_count_reference;
318 int char_count_test, char_count_reference;
319
320 // Calculate the number of utf-8 bytes for the utf-16 chars.
321 byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
322 byte_count_test = CountUtf8Bytes(buf, char_count);
323 EXPECT_EQ(byte_count_reference, byte_count_test);
324
325 // Convert the utf-16 string to utf-8 bytes.
326 ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
327 ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
328 for (int i = 0; i < byte_count_test; ++i) {
329 EXPECT_EQ(bytes_reference[i], bytes_test[i]);
330 }
331
332 // Calculate the number of utf-16 chars from the utf-8 bytes.
333 bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
334 char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
335 char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
336 EXPECT_EQ(char_count, char_count_reference);
337 EXPECT_EQ(char_count, char_count_test);
338
339 // Convert the utf-8 bytes back to utf-16 chars.
340 // Does not need copied _reference version of the function because the original
341 // function with the old API is retained for debug/testing code.
342 ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
343 ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
344 for (int i = 0; i < char_count_test; ++i) {
345 EXPECT_EQ(buf[i], out_buf_reference[i]);
346 EXPECT_EQ(buf[i], out_buf_test[i]);
347 }
348}
349
350TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
351 for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
Andreas Gampe4464a3e2016-03-03 20:15:47 -0800352 uint16_t buf[4] = { 0 };
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300353 if (codePoint <= 0xffff) {
354 if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
355 // According to the Unicode standard, no character will ever
Roland Levillain91d65e02016-01-19 15:59:16 +0000356 // be assigned to these code points, and they cannot be encoded
Bruce Hoult1646d7a2015-10-28 15:06:12 +0300357 // into either utf-16 or utf-8.
358 continue;
359 }
360 buf[0] = 'h';
361 buf[1] = codePoint;
362 buf[2] = 'e';
363 testConversions(buf, 2);
364 testConversions(buf, 3);
365 testConversions(buf + 1, 1);
366 testConversions(buf + 1, 2);
367 } else {
368 buf[0] = 'h';
369 codePointToSurrogatePair(codePoint, buf[1], buf[2]);
370 buf[3] = 'e';
371 testConversions(buf, 2);
372 testConversions(buf, 3);
373 testConversions(buf, 4);
374 testConversions(buf + 1, 1);
375 testConversions(buf + 1, 2);
376 testConversions(buf + 1, 3);
377 }
378 }
379}
380
Narayan Kamatha5afcfc2015-01-29 20:06:46 +0000381} // namespace art