blob: 3b71ed1b6a6e88fd6d4afd05da60fe4f0be95f71 [file] [log] [blame]
Reid Kleckner7df03c22013-07-16 17:14:33 +00001//===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/Support/ConvertUTF.h"
11#include "gtest/gtest.h"
12#include <string>
Dmitri Gribenko1089db02014-06-16 11:09:46 +000013#include <vector>
Reid Kleckner7df03c22013-07-16 17:14:33 +000014
15using namespace llvm;
16
17TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
18 // Src is the look of disapproval.
19 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
20 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
21 std::string Result;
22 bool Success = convertUTF16ToUTF8String(Ref, Result);
23 EXPECT_TRUE(Success);
24 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected, Result);
26}
27
28TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
29 // Src is the look of disapproval.
30 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
31 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
32 std::string Result;
33 bool Success = convertUTF16ToUTF8String(Ref, Result);
34 EXPECT_TRUE(Success);
35 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
36 EXPECT_EQ(Expected, Result);
37}
38
39TEST(ConvertUTFTest, OddLengthInput) {
40 std::string Result;
41 bool Success = convertUTF16ToUTF8String(ArrayRef<char>("xxxxx", 5), Result);
42 EXPECT_FALSE(Success);
43}
44
45TEST(ConvertUTFTest, Empty) {
46 std::string Result;
47 bool Success = convertUTF16ToUTF8String(ArrayRef<char>(), Result);
48 EXPECT_TRUE(Success);
49 EXPECT_TRUE(Result.empty());
50}
51
52TEST(ConvertUTFTest, HasUTF16BOM) {
53 bool HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xff\xfe", 2));
54 EXPECT_TRUE(HasBOM);
55 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff", 2));
56 EXPECT_TRUE(HasBOM);
57 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff ", 3));
58 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
59 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff\x00asdf", 6));
60 EXPECT_TRUE(HasBOM);
61
62 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>());
63 EXPECT_FALSE(HasBOM);
64 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
65 EXPECT_FALSE(HasBOM);
66}
Dmitri Gribenko1089db02014-06-16 11:09:46 +000067
68std::pair<ConversionResult, std::vector<unsigned>>
69ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
70 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
71
72 const UTF8 *SourceNext = SourceStart;
73 std::vector<UTF32> Decoded(S.size(), 0);
74 UTF32 *TargetStart = Decoded.data();
75
76 auto Result =
77 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
78 Decoded.data() + Decoded.size(), lenientConversion);
79
80 Decoded.resize(TargetStart - Decoded.data());
81
82 return std::make_pair(Result, Decoded);
83}
84
// Shorthand for the (status, scalars) pairs that the result of
// ConvertUTF8ToUnicodeScalarsLenient() is compared against:
//   R0(STATUS)      -- STATUS paired with an empty list of scalars.
//   R(STATUS, ...)  -- STATUS paired with the scalars listed after it.
#define R0(STATUS)                                                             \
  std::make_pair(STATUS, std::vector<unsigned>{})
#define R(STATUS, ...)                                                         \
  std::make_pair(STATUS, std::vector<unsigned>{ __VA_ARGS__ })
87
88TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
89
90 //
91 // 1-byte sequences
92 //
93
94 // U+0041 LATIN CAPITAL LETTER A
95 EXPECT_EQ(R(conversionOK, 0x0041),
96 ConvertUTF8ToUnicodeScalarsLenient("\x41"));
97
98 //
99 // 2-byte sequences
100 //
101
102 // U+0283 LATIN SMALL LETTER ESH
103 EXPECT_EQ(R(conversionOK, 0x0283),
104 ConvertUTF8ToUnicodeScalarsLenient("\xca\x83"));
105
106 // U+03BA GREEK SMALL LETTER KAPPA
107 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
108 // U+03C3 GREEK SMALL LETTER SIGMA
109 // U+03BC GREEK SMALL LETTER MU
110 // U+03B5 GREEK SMALL LETTER EPSILON
111 EXPECT_EQ(R(conversionOK, 0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
112 ConvertUTF8ToUnicodeScalarsLenient(
113 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
114
115 //
116 // 3-byte sequences
117 //
118
119 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
120 // U+6587 CJK UNIFIED IDEOGRAPH-6587
121 EXPECT_EQ(R(conversionOK, 0x4f8b, 0x6587),
122 ConvertUTF8ToUnicodeScalarsLenient("\xe4\xbe\x8b\xe6\x96\x87"));
123
124 // U+D55C HANGUL SYLLABLE HAN
125 // U+AE00 HANGUL SYLLABLE GEUL
126 EXPECT_EQ(R(conversionOK, 0xd55c, 0xae00),
127 ConvertUTF8ToUnicodeScalarsLenient("\xed\x95\x9c\xea\xb8\x80"));
128
129 // U+1112 HANGUL CHOSEONG HIEUH
130 // U+1161 HANGUL JUNGSEONG A
131 // U+11AB HANGUL JONGSEONG NIEUN
132 // U+1100 HANGUL CHOSEONG KIYEOK
133 // U+1173 HANGUL JUNGSEONG EU
134 // U+11AF HANGUL JONGSEONG RIEUL
135 EXPECT_EQ(R(conversionOK, 0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
136 ConvertUTF8ToUnicodeScalarsLenient(
137 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
138 "\xe1\x86\xaf"));
139
140 //
141 // 4-byte sequences
142 //
143
144 // U+E0100 VARIATION SELECTOR-17
145 EXPECT_EQ(R(conversionOK, 0x000E0100),
146 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xa0\x84\x80"));
147
148 //
149 // First possible sequence of a certain length
150 //
151
152 // U+0000 NULL
153 EXPECT_EQ(R(conversionOK, 0x0000),
154 ConvertUTF8ToUnicodeScalarsLenient(StringRef("\x00", 1)));
155
156 // U+0080 PADDING CHARACTER
157 EXPECT_EQ(R(conversionOK, 0x0080),
158 ConvertUTF8ToUnicodeScalarsLenient("\xc2\x80"));
159
160 // U+0800 SAMARITAN LETTER ALAF
161 EXPECT_EQ(R(conversionOK, 0x0800),
162 ConvertUTF8ToUnicodeScalarsLenient("\xe0\xa0\x80"));
163
164 // U+10000 LINEAR B SYLLABLE B008 A
165 EXPECT_EQ(R(conversionOK, 0x10000),
166 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90\x80\x80"));
167
168 // U+200000 (invalid)
169 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
170 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80\x80\x80"));
171
172 // U+4000000 (invalid)
173 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
174 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80\x80\x80"));
175
176 //
177 // Last possible sequence of a certain length
178 //
179
180 // U+007F DELETE
181 EXPECT_EQ(R(conversionOK, 0x007f),
182 ConvertUTF8ToUnicodeScalarsLenient("\x7f"));
183
184 // U+07FF (unassigned)
185 EXPECT_EQ(R(conversionOK, 0x07ff),
186 ConvertUTF8ToUnicodeScalarsLenient("\xdf\xbf"));
187
188 // U+FFFF (noncharacter)
189 EXPECT_EQ(R(conversionOK, 0xffff),
190 ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbf"));
191
192 // U+1FFFFF (invalid)
193 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
194 ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf\xbf\xbf"));
195
196 // U+3FFFFFF (invalid)
197 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
198 ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf\xbf\xbf"));
199
200 // U+7FFFFFFF (invalid)
201 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
202 ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf\xbf\xbf"));
203
204 //
205 // Other boundary conditions
206 //
207
208 // U+D7FF (unassigned)
209 EXPECT_EQ(R(conversionOK, 0xd7ff),
210 ConvertUTF8ToUnicodeScalarsLenient("\xed\x9f\xbf"));
211
212 // U+E000 (private use)
213 EXPECT_EQ(R(conversionOK, 0xe000),
214 ConvertUTF8ToUnicodeScalarsLenient("\xee\x80\x80"));
215
216 // U+FFFD REPLACEMENT CHARACTER
217 EXPECT_EQ(R(conversionOK, 0xfffd),
218 ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbd"));
219
220 // U+10FFFF (noncharacter)
221 EXPECT_EQ(R(conversionOK, 0x10ffff),
222 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbf"));
223
224 // U+110000 (invalid)
225 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
226 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90\x80\x80"));
227
228 //
229 // Unexpected continuation bytes
230 //
231
232 // A sequence of unexpected continuation bytes that don't follow a first
233 // byte; every byte is a maximal subpart.
234
235 EXPECT_EQ(R(sourceIllegal, 0xfffd),
236 ConvertUTF8ToUnicodeScalarsLenient("\x80"));
237 EXPECT_EQ(R(sourceIllegal, 0xfffd),
238 ConvertUTF8ToUnicodeScalarsLenient("\xbf"));
239 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
240 ConvertUTF8ToUnicodeScalarsLenient("\x80\x80"));
241 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
242 ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf"));
243 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
244 ConvertUTF8ToUnicodeScalarsLenient("\xbf\x80"));
245 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
246 ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x80"));
247 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
248 ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x80\xbf"));
249 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
250 ConvertUTF8ToUnicodeScalarsLenient("\x80\xbf\x82\xbf\xaa"));
251 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
252 ConvertUTF8ToUnicodeScalarsLenient("\xaa\xb0\xbb\xbf\xaa\xa0"));
253 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
254 0xfffd),
255 ConvertUTF8ToUnicodeScalarsLenient("\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
256
257 // All continuation bytes (0x80--0xbf).
258 EXPECT_EQ(R(sourceIllegal,
259 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
260 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
261 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
262 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
263 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
264 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
265 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
266 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
267 ConvertUTF8ToUnicodeScalarsLenient(
268 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
269 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
270 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
271 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
272
273 //
274 // Lonely start bytes
275 //
276
277 // Start bytes of 2-byte sequences (0xc0--0xdf).
278 EXPECT_EQ(R(sourceIllegal,
279 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
280 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
281 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
282 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
283 ConvertUTF8ToUnicodeScalarsLenient(
284 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
285 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
286
287 EXPECT_EQ(R(sourceIllegal,
288 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
289 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
290 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
291 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
292 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
293 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
294 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
295 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
296 ConvertUTF8ToUnicodeScalarsLenient(
297 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
298 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
299 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
300 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
301
302 // Start bytes of 3-byte sequences (0xe0--0xef).
303 EXPECT_EQ(R(sourceIllegal,
304 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
305 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
306 ConvertUTF8ToUnicodeScalarsLenient(
307 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
308
309 EXPECT_EQ(R(sourceIllegal,
310 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
311 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
312 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
313 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
314 ConvertUTF8ToUnicodeScalarsLenient(
315 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
316 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
317
318 // Start bytes of 4-byte sequences (0xf0--0xf7).
319 EXPECT_EQ(R(sourceIllegal,
320 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
321 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
322
323 EXPECT_EQ(R(sourceIllegal,
324 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020,
325 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
326 ConvertUTF8ToUnicodeScalarsLenient(
327 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
328
329 // Start bytes of 5-byte sequences (0xf8--0xfb).
330 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
331 ConvertUTF8ToUnicodeScalarsLenient("\xf8\xf9\xfa\xfb"));
332
333 EXPECT_EQ(R(sourceIllegal,
334 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
335 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
336
337 // Start bytes of 6-byte sequences (0xfc--0xfd).
338 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
339 ConvertUTF8ToUnicodeScalarsLenient("\xfc\xfd"));
340
341 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0x0020, 0xfffd, 0x0020),
342 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x20\xfd\x20"));
343
344 //
345 // Other bytes (0xc0--0xc1, 0xfe--0xff).
346 //
347
348 EXPECT_EQ(R(sourceIllegal, 0xfffd),
349 ConvertUTF8ToUnicodeScalarsLenient("\xc0"));
350 EXPECT_EQ(R(sourceIllegal, 0xfffd),
351 ConvertUTF8ToUnicodeScalarsLenient("\xc1"));
352 EXPECT_EQ(R(sourceIllegal, 0xfffd),
353 ConvertUTF8ToUnicodeScalarsLenient("\xfe"));
354 EXPECT_EQ(R(sourceIllegal, 0xfffd),
355 ConvertUTF8ToUnicodeScalarsLenient("\xff"));
356
357 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
358 ConvertUTF8ToUnicodeScalarsLenient("\xc0\xc1\xfe\xff"));
359
360 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
361 ConvertUTF8ToUnicodeScalarsLenient("\xfe\xfe\xff\xff"));
362
363 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
364 ConvertUTF8ToUnicodeScalarsLenient("\xfe\x80\x80\x80\x80\x80"));
365
366 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367 ConvertUTF8ToUnicodeScalarsLenient("\xff\x80\x80\x80\x80\x80"));
368
369 EXPECT_EQ(R(sourceIllegal,
370 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020, 0xfffd, 0x0020),
371 ConvertUTF8ToUnicodeScalarsLenient("\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
372
373 //
374 // Sequences with one continuation byte missing
375 //
376
377 EXPECT_EQ(R(sourceIllegal, 0xfffd),
378 ConvertUTF8ToUnicodeScalarsLenient("\xc2"));
379 EXPECT_EQ(R(sourceIllegal, 0xfffd),
380 ConvertUTF8ToUnicodeScalarsLenient("\xdf"));
381 EXPECT_EQ(R(sourceIllegal, 0xfffd),
382 ConvertUTF8ToUnicodeScalarsLenient("\xe0\xa0"));
383 EXPECT_EQ(R(sourceIllegal, 0xfffd),
384 ConvertUTF8ToUnicodeScalarsLenient("\xe0\xbf"));
385 EXPECT_EQ(R(sourceIllegal, 0xfffd),
386 ConvertUTF8ToUnicodeScalarsLenient("\xe1\x80"));
387 EXPECT_EQ(R(sourceIllegal, 0xfffd),
388 ConvertUTF8ToUnicodeScalarsLenient("\xec\xbf"));
389 EXPECT_EQ(R(sourceIllegal, 0xfffd),
390 ConvertUTF8ToUnicodeScalarsLenient("\xed\x80"));
391 EXPECT_EQ(R(sourceIllegal, 0xfffd),
392 ConvertUTF8ToUnicodeScalarsLenient("\xed\x9f"));
393 EXPECT_EQ(R(sourceIllegal, 0xfffd),
394 ConvertUTF8ToUnicodeScalarsLenient("\xee\x80"));
395 EXPECT_EQ(R(sourceIllegal, 0xfffd),
396 ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf"));
397 EXPECT_EQ(R(sourceIllegal, 0xfffd),
398 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90\x80"));
399 EXPECT_EQ(R(sourceIllegal, 0xfffd),
400 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf"));
401 EXPECT_EQ(R(sourceIllegal, 0xfffd),
402 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x80\x80"));
403 EXPECT_EQ(R(sourceIllegal, 0xfffd),
404 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf"));
405 EXPECT_EQ(R(sourceIllegal, 0xfffd),
406 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x80\x80"));
407 EXPECT_EQ(R(sourceIllegal, 0xfffd),
408 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf"));
409
410 // Overlong sequences with one trailing byte missing.
411 EXPECT_EQ(R(sourceIllegal, 0xfffd),
412 ConvertUTF8ToUnicodeScalarsLenient("\xc0"));
413 EXPECT_EQ(R(sourceIllegal, 0xfffd),
414 ConvertUTF8ToUnicodeScalarsLenient("\xc1"));
415 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
416 ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80"));
417 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
418 ConvertUTF8ToUnicodeScalarsLenient("\xe0\x9f"));
419 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
420 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80"));
421 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
422 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\x80"));
423 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
424 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80"));
425 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
426 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80"));
427
428 // Sequences that represent surrogates with one trailing byte missing.
429 // High surrogates
430 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
431 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0"));
432 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
433 ConvertUTF8ToUnicodeScalarsLenient("\xed\xac"));
434 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
435 ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf"));
436 // Low surrogates
437 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
438 ConvertUTF8ToUnicodeScalarsLenient("\xed\xb0"));
439 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
440 ConvertUTF8ToUnicodeScalarsLenient("\xed\xb4"));
441 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
442 ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf"));
443
444 // Ill-formed 4-byte sequences.
445 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
446 // U+1100xx (invalid)
447 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
448 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90\x80"));
449 // U+13FBxx (invalid)
450 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
451 ConvertUTF8ToUnicodeScalarsLenient("\xf4\xbf\xbf"));
452 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
453 ConvertUTF8ToUnicodeScalarsLenient("\xf5\x80\x80"));
454 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
455 ConvertUTF8ToUnicodeScalarsLenient("\xf6\x80\x80"));
456 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
457 ConvertUTF8ToUnicodeScalarsLenient("\xf7\x80\x80"));
458 // U+1FFBxx (invalid)
459 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
460 ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf\xbf"));
461
462 // Ill-formed 5-byte sequences.
463 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
464 // U+2000xx (invalid)
465 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
466 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80\x80"));
467 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
468 ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf\xbf\xbf"));
469 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
470 ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80\x80\x80"));
471 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
472 ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80\x80\x80"));
473 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
474 ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80\x80\x80"));
475 // U+3FFFFxx (invalid)
476 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
477 ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf\xbf"));
478
479 // Ill-formed 6-byte sequences.
480 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
481 // U+40000xx (invalid)
482 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
483 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80\x80"));
484 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
485 ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf\xbf\xbf"));
486 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
487 ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80\x80\x80"));
488 // U+7FFFFFxx (invalid)
489 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
490 ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf\xbf"));
491
492 //
493 // Sequences with two continuation bytes missing
494 //
495
496 EXPECT_EQ(R(sourceIllegal, 0xfffd),
497 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x90"));
498 EXPECT_EQ(R(sourceIllegal, 0xfffd),
499 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf"));
500 EXPECT_EQ(R(sourceIllegal, 0xfffd),
501 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x80"));
502 EXPECT_EQ(R(sourceIllegal, 0xfffd),
503 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf"));
504 EXPECT_EQ(R(sourceIllegal, 0xfffd),
505 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x80"));
506 EXPECT_EQ(R(sourceIllegal, 0xfffd),
507 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f"));
508
509 // Overlong sequences with two trailing bytes missing.
510 EXPECT_EQ(R(sourceIllegal, 0xfffd),
511 ConvertUTF8ToUnicodeScalarsLenient("\xe0"));
512 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
513 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80"));
514 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
515 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f"));
516 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
517 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80"));
518 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80"));
520
521 // Sequences that represent surrogates with two trailing bytes missing.
522 EXPECT_EQ(R(sourceIllegal, 0xfffd),
523 ConvertUTF8ToUnicodeScalarsLenient("\xed"));
524
525 // Ill-formed 4-byte sequences.
526 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
527 // U+110yxx (invalid)
528 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
529 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x90"));
530 // U+13Fyxx (invalid)
531 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
532 ConvertUTF8ToUnicodeScalarsLenient("\xf4\xbf"));
533 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
534 ConvertUTF8ToUnicodeScalarsLenient("\xf5\x80"));
535 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
536 ConvertUTF8ToUnicodeScalarsLenient("\xf6\x80"));
537 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
538 ConvertUTF8ToUnicodeScalarsLenient("\xf7\x80"));
539 // U+1FFyxx (invalid)
540 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
541 ConvertUTF8ToUnicodeScalarsLenient("\xf7\xbf"));
542
543 // Ill-formed 5-byte sequences.
544 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
545 // U+200yxx (invalid)
546 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
547 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88\x80"));
548 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
549 ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf\xbf"));
550 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
551 ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80\x80"));
552 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
553 ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80\x80"));
554 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
555 ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80\x80"));
556 // U+3FFFyxx (invalid)
557 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
558 ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf\xbf"));
559
560 // Ill-formed 6-byte sequences.
561 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
562 // U+4000yxx (invalid)
563 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
564 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80\x80"));
565 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
566 ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf\xbf"));
567 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
568 ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80\x80"));
569 // U+7FFFFyxx (invalid)
570 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
571 ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf\xbf"));
572
573 //
574 // Sequences with three continuation bytes missing
575 //
576
577 EXPECT_EQ(R(sourceIllegal, 0xfffd),
578 ConvertUTF8ToUnicodeScalarsLenient("\xf0"));
579 EXPECT_EQ(R(sourceIllegal, 0xfffd),
580 ConvertUTF8ToUnicodeScalarsLenient("\xf1"));
581 EXPECT_EQ(R(sourceIllegal, 0xfffd),
582 ConvertUTF8ToUnicodeScalarsLenient("\xf2"));
583 EXPECT_EQ(R(sourceIllegal, 0xfffd),
584 ConvertUTF8ToUnicodeScalarsLenient("\xf3"));
585 EXPECT_EQ(R(sourceIllegal, 0xfffd),
586 ConvertUTF8ToUnicodeScalarsLenient("\xf4"));
587
588 // Broken overlong sequences.
589 EXPECT_EQ(R(sourceIllegal, 0xfffd),
590 ConvertUTF8ToUnicodeScalarsLenient("\xf0"));
591 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
592 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80"));
593 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
594 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80"));
595
596 // Ill-formed 4-byte sequences.
597 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
598 // U+14yyxx (invalid)
599 EXPECT_EQ(R(sourceIllegal, 0xfffd),
600 ConvertUTF8ToUnicodeScalarsLenient("\xf5"));
601 EXPECT_EQ(R(sourceIllegal, 0xfffd),
602 ConvertUTF8ToUnicodeScalarsLenient("\xf6"));
603 // U+1Cyyxx (invalid)
604 EXPECT_EQ(R(sourceIllegal, 0xfffd),
605 ConvertUTF8ToUnicodeScalarsLenient("\xf7"));
606
607 // Ill-formed 5-byte sequences.
608 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
609 // U+20yyxx (invalid)
610 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
611 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x88"));
612 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
613 ConvertUTF8ToUnicodeScalarsLenient("\xf8\xbf"));
614 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
615 ConvertUTF8ToUnicodeScalarsLenient("\xf9\x80"));
616 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
617 ConvertUTF8ToUnicodeScalarsLenient("\xfa\x80"));
618 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
619 ConvertUTF8ToUnicodeScalarsLenient("\xfb\x80"));
620 // U+3FCyyxx (invalid)
621 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
622 ConvertUTF8ToUnicodeScalarsLenient("\xfb\xbf"));
623
624 // Ill-formed 6-byte sequences.
625 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
626 // U+400yyxx (invalid)
627 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
628 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84\x80"));
629 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
630 ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf\xbf"));
631 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
632 ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80\x80"));
633 // U+7FFCyyxx (invalid)
634 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
635 ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf\xbf"));
636
637 //
638 // Sequences with four continuation bytes missing
639 //
640
641 // Ill-formed 5-byte sequences.
642 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
643 // U+uzyyxx (invalid)
644 EXPECT_EQ(R(sourceIllegal, 0xfffd),
645 ConvertUTF8ToUnicodeScalarsLenient("\xf8"));
646 EXPECT_EQ(R(sourceIllegal, 0xfffd),
647 ConvertUTF8ToUnicodeScalarsLenient("\xf9"));
648 EXPECT_EQ(R(sourceIllegal, 0xfffd),
649 ConvertUTF8ToUnicodeScalarsLenient("\xfa"));
650 EXPECT_EQ(R(sourceIllegal, 0xfffd),
651 ConvertUTF8ToUnicodeScalarsLenient("\xfb"));
652 // U+3zyyxx (invalid)
653 EXPECT_EQ(R(sourceIllegal, 0xfffd),
654 ConvertUTF8ToUnicodeScalarsLenient("\xfb"));
655
656 // Broken overlong sequences.
657 EXPECT_EQ(R(sourceIllegal, 0xfffd),
658 ConvertUTF8ToUnicodeScalarsLenient("\xf8"));
659 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
660 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80"));
661
662 // Ill-formed 6-byte sequences.
663 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
664 // U+uzzyyxx (invalid)
665 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
666 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x84"));
667 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
668 ConvertUTF8ToUnicodeScalarsLenient("\xfc\xbf"));
669 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
670 ConvertUTF8ToUnicodeScalarsLenient("\xfd\x80"));
671 // U+7Fzzyyxx (invalid)
672 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
673 ConvertUTF8ToUnicodeScalarsLenient("\xfd\xbf"));
674
675 //
676 // Sequences with five continuation bytes missing
677 //
678
679 // Ill-formed 6-byte sequences.
680 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
681 // U+uzzyyxx (invalid)
682 EXPECT_EQ(R(sourceIllegal, 0xfffd),
683 ConvertUTF8ToUnicodeScalarsLenient("\xfc"));
684 // U+uuzzyyxx (invalid)
685 EXPECT_EQ(R(sourceIllegal, 0xfffd),
686 ConvertUTF8ToUnicodeScalarsLenient("\xfd"));
687
688 //
689 // Consecutive sequences with trailing bytes missing
690 //
691
692 EXPECT_EQ(R(sourceIllegal,
693 0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd, /**/
694 0xfffd, 0xfffd, 0xfffd, 0xfffd,
695 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
696 0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd, /**/
697 0xfffd, 0xfffd, 0xfffd, 0xfffd,
698 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
699 ConvertUTF8ToUnicodeScalarsLenient(
700 "\xc0" "\xe0\x80" "\xf0\x80\x80"
701 "\xf8\x80\x80\x80"
702 "\xfc\x80\x80\x80\x80"
703 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
704 "\xfb\xbf\xbf\xbf"
705 "\xfd\xbf\xbf\xbf\xbf"));
706
707
708 //
709 // Overlong UTF-8 sequences
710 //
711
712 // U+002F SOLIDUS
713 EXPECT_EQ(R(conversionOK, 0x002f),
714 ConvertUTF8ToUnicodeScalarsLenient("\x2f"));
715
716 // Overlong sequences of the above.
717 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
718 ConvertUTF8ToUnicodeScalarsLenient("\xc0\xaf"));
719 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
720 ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80\xaf"));
721 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
722 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80\xaf"));
723 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
724 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80\xaf"));
725 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
726 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80\xaf"));
727
728 // U+0000 NULL
729 EXPECT_EQ(R(conversionOK, 0x0000),
730 ConvertUTF8ToUnicodeScalarsLenient(StringRef("\x00", 1)));
731
732 // Overlong sequences of the above.
733 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
734 ConvertUTF8ToUnicodeScalarsLenient("\xc0\x80"));
735 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
736 ConvertUTF8ToUnicodeScalarsLenient("\xe0\x80\x80"));
737 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
738 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x80\x80\x80"));
739 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
740 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x80\x80\x80\x80"));
741 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
742 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x80\x80\x80\x80\x80"));
743
744 // Other overlong sequences.
745 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
746 ConvertUTF8ToUnicodeScalarsLenient("\xc0\xbf"));
747 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
748 ConvertUTF8ToUnicodeScalarsLenient("\xc1\x80"));
749 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd),
750 ConvertUTF8ToUnicodeScalarsLenient("\xc1\xbf"));
751 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
752 ConvertUTF8ToUnicodeScalarsLenient("\xe0\x9f\xbf"));
753 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
754 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80"));
755 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
756 ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf\xbf"));
757 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
758 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\x80\x80"));
759 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
760 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x8f\xbf\xbf"));
761 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
762 ConvertUTF8ToUnicodeScalarsLenient("\xf8\x87\xbf\xbf\xbf"));
763 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
764 ConvertUTF8ToUnicodeScalarsLenient("\xfc\x83\xbf\xbf\xbf\xbf"));
765
766 //
767 // Isolated surrogates
768 //
769
770 // Unicode 6.3.0:
771 //
772 // D71. High-surrogate code point: A Unicode code point in the range
773 // U+D800 to U+DBFF.
774 //
775 // D73. Low-surrogate code point: A Unicode code point in the range
776 // U+DC00 to U+DFFF.
777
778 // Note: U+E0100 is <DB40 DD00> in UTF16.
779
780 // High surrogates
781
782 // U+D800
783 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
784 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80"));
785
786 // U+DB40
787 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
788 ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0"));
789
790 // U+DBFF
791 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
792 ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf"));
793
794 // Low surrogates
795
796 // U+DC00
797 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
798 ConvertUTF8ToUnicodeScalarsLenient("\xed\xb0\x80"));
799
800 // U+DD00
801 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
802 ConvertUTF8ToUnicodeScalarsLenient("\xed\xb4\x80"));
803
804 // U+DFFF
805 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd),
806 ConvertUTF8ToUnicodeScalarsLenient("\xed\xbf\xbf"));
807
808 // Surrogate pairs
809
810 // U+D800 U+DC00
811 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
812 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xb0\x80"));
813
814 // U+D800 U+DD00
815 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
816 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xb4\x80"));
817
818 // U+D800 U+DFFF
819 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
820 ConvertUTF8ToUnicodeScalarsLenient("\xed\xa0\x80\xed\xbf\xbf"));
821
822 // U+DB40 U+DC00
823 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
824 ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xb0\x80"));
825
826 // U+DB40 U+DD00
827 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
828 ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xb4\x80"));
829
830 // U+DB40 U+DFFF
831 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
832 ConvertUTF8ToUnicodeScalarsLenient("\xed\xac\xa0\xed\xbf\xbf"));
833
834 // U+DBFF U+DC00
835 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
836 ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xb0\x80"));
837
838 // U+DBFF U+DD00
839 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
840 ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xb4\x80"));
841
842 // U+DBFF U+DFFF
843 EXPECT_EQ(R(sourceIllegal, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
844 ConvertUTF8ToUnicodeScalarsLenient("\xed\xaf\xbf\xed\xbf\xbf"));
845
846 //
847 // Noncharacters
848 //
849
850 // Unicode 6.3.0:
851 //
852 // D14. Noncharacter: A code point that is permanently reserved for
853 // internal use and that should never be interchanged. Noncharacters
  // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
855 // and the values U+FDD0..U+FDEF.
856
857 // U+FFFE
858 EXPECT_EQ(R(conversionOK, 0xfffe),
859 ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbe"));
860
861 // U+FFFF
862 EXPECT_EQ(R(conversionOK, 0xffff),
863 ConvertUTF8ToUnicodeScalarsLenient("\xef\xbf\xbf"));
864
865 // U+1FFFE
866 EXPECT_EQ(R(conversionOK, 0x1fffe),
867 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x9f\xbf\xbe"));
868
869 // U+1FFFF
870 EXPECT_EQ(R(conversionOK, 0x1ffff),
871 ConvertUTF8ToUnicodeScalarsLenient("\xf0\x9f\xbf\xbf"));
872
873 // U+2FFFE
874 EXPECT_EQ(R(conversionOK, 0x2fffe),
875 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xaf\xbf\xbe"));
876
877 // U+2FFFF
878 EXPECT_EQ(R(conversionOK, 0x2ffff),
879 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xaf\xbf\xbf"));
880
881 // U+3FFFE
882 EXPECT_EQ(R(conversionOK, 0x3fffe),
883 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf\xbe"));
884
885 // U+3FFFF
886 EXPECT_EQ(R(conversionOK, 0x3ffff),
887 ConvertUTF8ToUnicodeScalarsLenient("\xf0\xbf\xbf\xbf"));
888
889 // U+4FFFE
890 EXPECT_EQ(R(conversionOK, 0x4fffe),
891 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x8f\xbf\xbe"));
892
893 // U+4FFFF
894 EXPECT_EQ(R(conversionOK, 0x4ffff),
895 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x8f\xbf\xbf"));
896
897 // U+5FFFE
898 EXPECT_EQ(R(conversionOK, 0x5fffe),
899 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x9f\xbf\xbe"));
900
901 // U+5FFFF
902 EXPECT_EQ(R(conversionOK, 0x5ffff),
903 ConvertUTF8ToUnicodeScalarsLenient("\xf1\x9f\xbf\xbf"));
904
905 // U+6FFFE
906 EXPECT_EQ(R(conversionOK, 0x6fffe),
907 ConvertUTF8ToUnicodeScalarsLenient("\xf1\xaf\xbf\xbe"));
908
909 // U+6FFFF
910 EXPECT_EQ(R(conversionOK, 0x6ffff),
911 ConvertUTF8ToUnicodeScalarsLenient("\xf1\xaf\xbf\xbf"));
912
913 // U+7FFFE
914 EXPECT_EQ(R(conversionOK, 0x7fffe),
915 ConvertUTF8ToUnicodeScalarsLenient("\xf1\xbf\xbf\xbe"));
916
917 // U+7FFFF
918 EXPECT_EQ(R(conversionOK, 0x7ffff),
919 ConvertUTF8ToUnicodeScalarsLenient("\xf1\xbf\xbf\xbf"));
920
921 // U+8FFFE
922 EXPECT_EQ(R(conversionOK, 0x8fffe),
923 ConvertUTF8ToUnicodeScalarsLenient("\xf2\x8f\xbf\xbe"));
924
925 // U+8FFFF
926 EXPECT_EQ(R(conversionOK, 0x8ffff),
927 ConvertUTF8ToUnicodeScalarsLenient("\xf2\x8f\xbf\xbf"));
928
929 // U+9FFFE
930 EXPECT_EQ(R(conversionOK, 0x9fffe),
931 ConvertUTF8ToUnicodeScalarsLenient("\xf2\x9f\xbf\xbe"));
932
933 // U+9FFFF
934 EXPECT_EQ(R(conversionOK, 0x9ffff),
935 ConvertUTF8ToUnicodeScalarsLenient("\xf2\x9f\xbf\xbf"));
936
937 // U+AFFFE
938 EXPECT_EQ(R(conversionOK, 0xafffe),
939 ConvertUTF8ToUnicodeScalarsLenient("\xf2\xaf\xbf\xbe"));
940
941 // U+AFFFF
942 EXPECT_EQ(R(conversionOK, 0xaffff),
943 ConvertUTF8ToUnicodeScalarsLenient("\xf2\xaf\xbf\xbf"));
944
945 // U+BFFFE
946 EXPECT_EQ(R(conversionOK, 0xbfffe),
947 ConvertUTF8ToUnicodeScalarsLenient("\xf2\xbf\xbf\xbe"));
948
949 // U+BFFFF
950 EXPECT_EQ(R(conversionOK, 0xbffff),
951 ConvertUTF8ToUnicodeScalarsLenient("\xf2\xbf\xbf\xbf"));
952
953 // U+CFFFE
954 EXPECT_EQ(R(conversionOK, 0xcfffe),
955 ConvertUTF8ToUnicodeScalarsLenient("\xf3\x8f\xbf\xbe"));
956
957 // U+CFFFF
958 EXPECT_EQ(R(conversionOK, 0xcfffF),
959 ConvertUTF8ToUnicodeScalarsLenient("\xf3\x8f\xbf\xbf"));
960
961 // U+DFFFE
962 EXPECT_EQ(R(conversionOK, 0xdfffe),
963 ConvertUTF8ToUnicodeScalarsLenient("\xf3\x9f\xbf\xbe"));
964
965 // U+DFFFF
966 EXPECT_EQ(R(conversionOK, 0xdffff),
967 ConvertUTF8ToUnicodeScalarsLenient("\xf3\x9f\xbf\xbf"));
968
969 // U+EFFFE
970 EXPECT_EQ(R(conversionOK, 0xefffe),
971 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xaf\xbf\xbe"));
972
973 // U+EFFFF
974 EXPECT_EQ(R(conversionOK, 0xeffff),
975 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xaf\xbf\xbf"));
976
977 // U+FFFFE
978 EXPECT_EQ(R(conversionOK, 0xffffe),
979 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf\xbe"));
980
981 // U+FFFFF
982 EXPECT_EQ(R(conversionOK, 0xfffff),
983 ConvertUTF8ToUnicodeScalarsLenient("\xf3\xbf\xbf\xbf"));
984
985 // U+10FFFE
986 EXPECT_EQ(R(conversionOK, 0x10fffe),
987 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbe"));
988
989 // U+10FFFF
990 EXPECT_EQ(R(conversionOK, 0x10ffff),
991 ConvertUTF8ToUnicodeScalarsLenient("\xf4\x8f\xbf\xbf"));
992
993 // U+FDD0
994 EXPECT_EQ(R(conversionOK, 0xfdd0),
995 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x90"));
996
997 // U+FDD1
998 EXPECT_EQ(R(conversionOK, 0xfdd1),
999 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x91"));
1000
1001 // U+FDD2
1002 EXPECT_EQ(R(conversionOK, 0xfdd2),
1003 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x92"));
1004
1005 // U+FDD3
1006 EXPECT_EQ(R(conversionOK, 0xfdd3),
1007 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x93"));
1008
1009 // U+FDD4
1010 EXPECT_EQ(R(conversionOK, 0xfdd4),
1011 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x94"));
1012
1013 // U+FDD5
1014 EXPECT_EQ(R(conversionOK, 0xfdd5),
1015 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x95"));
1016
1017 // U+FDD6
1018 EXPECT_EQ(R(conversionOK, 0xfdd6),
1019 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x96"));
1020
1021 // U+FDD7
1022 EXPECT_EQ(R(conversionOK, 0xfdd7),
1023 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x97"));
1024
1025 // U+FDD8
1026 EXPECT_EQ(R(conversionOK, 0xfdd8),
1027 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x98"));
1028
1029 // U+FDD9
1030 EXPECT_EQ(R(conversionOK, 0xfdd9),
1031 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x99"));
1032
1033 // U+FDDA
1034 EXPECT_EQ(R(conversionOK, 0xfdda),
1035 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9a"));
1036
1037 // U+FDDB
1038 EXPECT_EQ(R(conversionOK, 0xfddb),
1039 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9b"));
1040
1041 // U+FDDC
1042 EXPECT_EQ(R(conversionOK, 0xfddc),
1043 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9c"));
1044
1045 // U+FDDD
1046 EXPECT_EQ(R(conversionOK, 0xfddd),
1047 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9d"));
1048
1049 // U+FDDE
1050 EXPECT_EQ(R(conversionOK, 0xfdde),
1051 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9e"));
1052
1053 // U+FDDF
1054 EXPECT_EQ(R(conversionOK, 0xfddf),
1055 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\x9f"));
1056
1057 // U+FDE0
1058 EXPECT_EQ(R(conversionOK, 0xfde0),
1059 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa0"));
1060
1061 // U+FDE1
1062 EXPECT_EQ(R(conversionOK, 0xfde1),
1063 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa1"));
1064
1065 // U+FDE2
1066 EXPECT_EQ(R(conversionOK, 0xfde2),
1067 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa2"));
1068
1069 // U+FDE3
1070 EXPECT_EQ(R(conversionOK, 0xfde3),
1071 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa3"));
1072
1073 // U+FDE4
1074 EXPECT_EQ(R(conversionOK, 0xfde4),
1075 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa4"));
1076
1077 // U+FDE5
1078 EXPECT_EQ(R(conversionOK, 0xfde5),
1079 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa5"));
1080
1081 // U+FDE6
1082 EXPECT_EQ(R(conversionOK, 0xfde6),
1083 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa6"));
1084
1085 // U+FDE7
1086 EXPECT_EQ(R(conversionOK, 0xfde7),
1087 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa7"));
1088
1089 // U+FDE8
1090 EXPECT_EQ(R(conversionOK, 0xfde8),
1091 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa8"));
1092
1093 // U+FDE9
1094 EXPECT_EQ(R(conversionOK, 0xfde9),
1095 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xa9"));
1096
1097 // U+FDEA
1098 EXPECT_EQ(R(conversionOK, 0xfdea),
1099 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xaa"));
1100
1101 // U+FDEB
1102 EXPECT_EQ(R(conversionOK, 0xfdeb),
1103 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xab"));
1104
1105 // U+FDEC
1106 EXPECT_EQ(R(conversionOK, 0xfdec),
1107 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xac"));
1108
1109 // U+FDED
1110 EXPECT_EQ(R(conversionOK, 0xfded),
1111 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xad"));
1112
1113 // U+FDEE
1114 EXPECT_EQ(R(conversionOK, 0xfdee),
1115 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xae"));
1116
1117 // U+FDEF
1118 EXPECT_EQ(R(conversionOK, 0xfdef),
1119 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xaf"));
1120
1121 // U+FDF0
1122 EXPECT_EQ(R(conversionOK, 0xfdf0),
1123 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb0"));
1124
1125 // U+FDF1
1126 EXPECT_EQ(R(conversionOK, 0xfdf1),
1127 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb1"));
1128
1129 // U+FDF2
1130 EXPECT_EQ(R(conversionOK, 0xfdf2),
1131 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb2"));
1132
1133 // U+FDF3
1134 EXPECT_EQ(R(conversionOK, 0xfdf3),
1135 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb3"));
1136
1137 // U+FDF4
1138 EXPECT_EQ(R(conversionOK, 0xfdf4),
1139 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb4"));
1140
1141 // U+FDF5
1142 EXPECT_EQ(R(conversionOK, 0xfdf5),
1143 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb5"));
1144
1145 // U+FDF6
1146 EXPECT_EQ(R(conversionOK, 0xfdf6),
1147 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb6"));
1148
1149 // U+FDF7
1150 EXPECT_EQ(R(conversionOK, 0xfdf7),
1151 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb7"));
1152
1153 // U+FDF8
1154 EXPECT_EQ(R(conversionOK, 0xfdf8),
1155 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb8"));
1156
1157 // U+FDF9
1158 EXPECT_EQ(R(conversionOK, 0xfdf9),
1159 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xb9"));
1160
1161 // U+FDFA
1162 EXPECT_EQ(R(conversionOK, 0xfdfa),
1163 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xba"));
1164
1165 // U+FDFB
1166 EXPECT_EQ(R(conversionOK, 0xfdfb),
1167 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbb"));
1168
1169 // U+FDFC
1170 EXPECT_EQ(R(conversionOK, 0xfdfc),
1171 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbc"));
1172
1173 // U+FDFD
1174 EXPECT_EQ(R(conversionOK, 0xfdfd),
1175 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbd"));
1176
1177 // U+FDFE
1178 EXPECT_EQ(R(conversionOK, 0xfdfe),
1179 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbe"));
1180
1181 // U+FDFF
1182 EXPECT_EQ(R(conversionOK, 0xfdff),
1183 ConvertUTF8ToUnicodeScalarsLenient("\xef\xb7\xbf"));
1184}
1185
1186std::pair<ConversionResult, std::vector<unsigned>>
1187ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
1188 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
1189
1190 const UTF8 *SourceNext = SourceStart;
1191 std::vector<UTF32> Decoded(S.size(), 0);
1192 UTF32 *TargetStart = Decoded.data();
1193
1194 auto Result = ConvertUTF8toUTF32Partial(
1195 &SourceNext, SourceStart + S.size(), &TargetStart,
1196 Decoded.data() + Decoded.size(), lenientConversion);
1197
1198 Decoded.resize(TargetStart - Decoded.data());
1199
1200 return std::make_pair(Result, Decoded);
1201}
1202
1203TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1204 // U+0041 LATIN CAPITAL LETTER A
1205 EXPECT_EQ(R(conversionOK, 0x0041),
1206 ConvertUTF8ToUnicodeScalarsPartialLenient("\x41"));
1207
1208 //
1209 // Sequences with one continuation byte missing
1210 //
1211
1212 EXPECT_EQ(R0(sourceExhausted),
1213 ConvertUTF8ToUnicodeScalarsPartialLenient("\xc2"));
1214 EXPECT_EQ(R0(sourceExhausted),
1215 ConvertUTF8ToUnicodeScalarsPartialLenient("\xdf"));
1216 EXPECT_EQ(R0(sourceExhausted),
1217 ConvertUTF8ToUnicodeScalarsPartialLenient("\xe0\xa0"));
1218 EXPECT_EQ(R0(sourceExhausted),
1219 ConvertUTF8ToUnicodeScalarsPartialLenient("\xe0\xbf"));
1220 EXPECT_EQ(R0(sourceExhausted),
1221 ConvertUTF8ToUnicodeScalarsPartialLenient("\xe1\x80"));
1222 EXPECT_EQ(R0(sourceExhausted),
1223 ConvertUTF8ToUnicodeScalarsPartialLenient("\xec\xbf"));
1224 EXPECT_EQ(R0(sourceExhausted),
1225 ConvertUTF8ToUnicodeScalarsPartialLenient("\xed\x80"));
1226 EXPECT_EQ(R0(sourceExhausted),
1227 ConvertUTF8ToUnicodeScalarsPartialLenient("\xed\x9f"));
1228 EXPECT_EQ(R0(sourceExhausted),
1229 ConvertUTF8ToUnicodeScalarsPartialLenient("\xee\x80"));
1230 EXPECT_EQ(R0(sourceExhausted),
1231 ConvertUTF8ToUnicodeScalarsPartialLenient("\xef\xbf"));
1232 EXPECT_EQ(R0(sourceExhausted),
1233 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf0\x90\x80"));
1234 EXPECT_EQ(R0(sourceExhausted),
1235 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf0\xbf\xbf"));
1236 EXPECT_EQ(R0(sourceExhausted),
1237 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf1\x80\x80"));
1238 EXPECT_EQ(R0(sourceExhausted),
1239 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf3\xbf\xbf"));
1240 EXPECT_EQ(R0(sourceExhausted),
1241 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf4\x80\x80"));
1242 EXPECT_EQ(R0(sourceExhausted),
1243 ConvertUTF8ToUnicodeScalarsPartialLenient("\xf4\x8f\xbf"));
1244
1245 EXPECT_EQ(R(sourceExhausted, 0x0041),
1246 ConvertUTF8ToUnicodeScalarsPartialLenient("\x41\xc2"));
1247}
1248
1249#undef R0
1250#undef R
1251