blob: 16c9bebfde1600b07ab3416e343fc1570798ecb1 [file] [log] [blame]
Reid Kleckner7df03c22013-07-16 17:14:33 +00001//===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/Support/ConvertUTF.h"
11#include "gtest/gtest.h"
12#include <string>
Dmitri Gribenko1089db02014-06-16 11:09:46 +000013#include <vector>
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +000014#include <utility>
Reid Kleckner7df03c22013-07-16 17:14:33 +000015
16using namespace llvm;
17
18TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19 // Src is the look of disapproval.
20 static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22 std::string Result;
23 bool Success = convertUTF16ToUTF8String(Ref, Result);
24 EXPECT_TRUE(Success);
25 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26 EXPECT_EQ(Expected, Result);
27}
28
29TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30 // Src is the look of disapproval.
31 static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33 std::string Result;
34 bool Success = convertUTF16ToUTF8String(Ref, Result);
35 EXPECT_TRUE(Success);
36 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37 EXPECT_EQ(Expected, Result);
38}
39
40TEST(ConvertUTFTest, OddLengthInput) {
41 std::string Result;
42 bool Success = convertUTF16ToUTF8String(ArrayRef<char>("xxxxx", 5), Result);
43 EXPECT_FALSE(Success);
44}
45
46TEST(ConvertUTFTest, Empty) {
47 std::string Result;
48 bool Success = convertUTF16ToUTF8String(ArrayRef<char>(), Result);
49 EXPECT_TRUE(Success);
50 EXPECT_TRUE(Result.empty());
51}
52
53TEST(ConvertUTFTest, HasUTF16BOM) {
54 bool HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xff\xfe", 2));
55 EXPECT_TRUE(HasBOM);
56 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff", 2));
57 EXPECT_TRUE(HasBOM);
58 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff ", 3));
59 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
60 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff\x00asdf", 6));
61 EXPECT_TRUE(HasBOM);
62
63 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>());
64 EXPECT_FALSE(HasBOM);
65 HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
66 EXPECT_FALSE(HasBOM);
67}
Dmitri Gribenko1089db02014-06-16 11:09:46 +000068
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +000069struct ConvertUTFResultContainer {
70 ConversionResult ErrorCode;
71 std::vector<unsigned> UnicodeScalars;
72
73 ConvertUTFResultContainer(ConversionResult ErrorCode)
74 : ErrorCode(ErrorCode) {}
75
76 ConvertUTFResultContainer
77 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
78 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
79 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
80 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
81 ConvertUTFResultContainer Result(*this);
82 if (US0 != 0x110000)
83 Result.UnicodeScalars.push_back(US0);
84 if (US1 != 0x110000)
85 Result.UnicodeScalars.push_back(US1);
86 if (US2 != 0x110000)
87 Result.UnicodeScalars.push_back(US2);
88 if (US3 != 0x110000)
89 Result.UnicodeScalars.push_back(US3);
90 if (US4 != 0x110000)
91 Result.UnicodeScalars.push_back(US4);
92 if (US5 != 0x110000)
93 Result.UnicodeScalars.push_back(US5);
94 if (US6 != 0x110000)
95 Result.UnicodeScalars.push_back(US6);
96 if (US7 != 0x110000)
97 Result.UnicodeScalars.push_back(US7);
98 return Result;
99 }
100};
101
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000102std::pair<ConversionResult, std::vector<unsigned>>
103ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
104 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
105
106 const UTF8 *SourceNext = SourceStart;
107 std::vector<UTF32> Decoded(S.size(), 0);
108 UTF32 *TargetStart = Decoded.data();
109
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000110 auto ErrorCode =
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000111 ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
112 Decoded.data() + Decoded.size(), lenientConversion);
113
114 Decoded.resize(TargetStart - Decoded.data());
115
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000116 return std::make_pair(ErrorCode, Decoded);
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000117}
118
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000119std::pair<ConversionResult, std::vector<unsigned>>
120ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
121 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
122
123 const UTF8 *SourceNext = SourceStart;
124 std::vector<UTF32> Decoded(S.size(), 0);
125 UTF32 *TargetStart = Decoded.data();
126
127 auto ErrorCode = ConvertUTF8toUTF32Partial(
128 &SourceNext, SourceStart + S.size(), &TargetStart,
129 Decoded.data() + Decoded.size(), lenientConversion);
130
131 Decoded.resize(TargetStart - Decoded.data());
132
133 return std::make_pair(ErrorCode, Decoded);
134}
135
136::testing::AssertionResult
137CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
138 StringRef S, bool Partial = false) {
139 ConversionResult ErrorCode;
140 std::vector<unsigned> Decoded;
141 if (!Partial)
142 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
143 else
144
145 std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
146 if (Expected.ErrorCode != ErrorCode)
147 return ::testing::AssertionFailure() << "Expected error code "
148 << Expected.ErrorCode << ", actual "
149 << ErrorCode;
150
151 if (Expected.UnicodeScalars != Decoded)
152 return ::testing::AssertionFailure()
153 << "Expected lenient decoded result:\n"
154 << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
155 << "Actual result:\n" << ::testing::PrintToString(Decoded);
156
157 return ::testing::AssertionSuccess();
158}
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000159
160TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
161
162 //
163 // 1-byte sequences
164 //
165
166 // U+0041 LATIN CAPITAL LETTER A
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000167 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
168 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000169
170 //
171 // 2-byte sequences
172 //
173
174 // U+0283 LATIN SMALL LETTER ESH
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000175 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
176 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
177 "\xca\x83"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000178
179 // U+03BA GREEK SMALL LETTER KAPPA
180 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
181 // U+03C3 GREEK SMALL LETTER SIGMA
182 // U+03BC GREEK SMALL LETTER MU
183 // U+03B5 GREEK SMALL LETTER EPSILON
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
185 ConvertUTFResultContainer(conversionOK)
186 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
187 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000188
189 //
190 // 3-byte sequences
191 //
192
193 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
194 // U+6587 CJK UNIFIED IDEOGRAPH-6587
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000195 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
196 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
197 "\xe4\xbe\x8b\xe6\x96\x87"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000198
199 // U+D55C HANGUL SYLLABLE HAN
200 // U+AE00 HANGUL SYLLABLE GEUL
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
202 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
203 "\xed\x95\x9c\xea\xb8\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000204
205 // U+1112 HANGUL CHOSEONG HIEUH
206 // U+1161 HANGUL JUNGSEONG A
207 // U+11AB HANGUL JONGSEONG NIEUN
208 // U+1100 HANGUL CHOSEONG KIYEOK
209 // U+1173 HANGUL JUNGSEONG EU
210 // U+11AF HANGUL JONGSEONG RIEUL
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
212 ConvertUTFResultContainer(conversionOK)
213 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
214 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
215 "\xe1\x86\xaf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000216
217 //
218 // 4-byte sequences
219 //
220
221 // U+E0100 VARIATION SELECTOR-17
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
224 "\xf3\xa0\x84\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000225
226 //
227 // First possible sequence of a certain length
228 //
229
230 // U+0000 NULL
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
233 StringRef("\x00", 1)));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000234
235 // U+0080 PADDING CHARACTER
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
238 "\xc2\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000239
240 // U+0800 SAMARITAN LETTER ALAF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
242 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
243 "\xe0\xa0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000244
245 // U+10000 LINEAR B SYLLABLE B008 A
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
248 "\xf0\x90\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000249
250 // U+200000 (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
252 ConvertUTFResultContainer(sourceIllegal)
253 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
254 "\xf8\x88\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000255
256 // U+4000000 (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000257 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
258 ConvertUTFResultContainer(sourceIllegal)
259 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
260 "\xfc\x84\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000261
262 //
263 // Last possible sequence of a certain length
264 //
265
266 // U+007F DELETE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000269
270 // U+07FF (unassigned)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
273 "\xdf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000274
275 // U+FFFF (noncharacter)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
277 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
278 "\xef\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000279
280 // U+1FFFFF (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282 ConvertUTFResultContainer(sourceIllegal)
283 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
284 "\xf7\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000285
286 // U+3FFFFFF (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000287 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
288 ConvertUTFResultContainer(sourceIllegal)
289 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
290 "\xfb\xbf\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000291
292 // U+7FFFFFFF (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294 ConvertUTFResultContainer(sourceIllegal)
295 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
296 "\xfd\xbf\xbf\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000297
298 //
299 // Other boundary conditions
300 //
301
302 // U+D7FF (unassigned)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
305 "\xed\x9f\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000306
307 // U+E000 (private use)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
310 "\xee\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000311
312 // U+FFFD REPLACEMENT CHARACTER
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
315 "\xef\xbf\xbd"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000316
317 // U+10FFFF (noncharacter)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
320 "\xf4\x8f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000321
322 // U+110000 (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(sourceIllegal)
325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
326 "\xf4\x90\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000327
328 //
329 // Unexpected continuation bytes
330 //
331
332 // A sequence of unexpected continuation bytes that don't follow a first
333 // byte, every byte is a maximal subpart.
334
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000335 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
337 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
341 "\x80\x80"));
342 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
344 "\x80\xbf"));
345 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
346 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
347 "\xbf\x80"));
348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349 ConvertUTFResultContainer(sourceIllegal)
350 .withScalars(0xfffd, 0xfffd, 0xfffd),
351 "\x80\xbf\x80"));
352 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353 ConvertUTFResultContainer(sourceIllegal)
354 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355 "\x80\xbf\x80\xbf"));
356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357 ConvertUTFResultContainer(sourceIllegal)
358 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
359 "\x80\xbf\x82\xbf\xaa"));
360 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361 ConvertUTFResultContainer(sourceIllegal)
362 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
363 "\xaa\xb0\xbb\xbf\xaa\xa0"));
364 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365 ConvertUTFResultContainer(sourceIllegal)
366 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000368
369 // All continuation bytes (0x80--0xbf).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000370 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371 ConvertUTFResultContainer(sourceIllegal)
372 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
373 0xfffd, 0xfffd, 0xfffd, 0xfffd)
374 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
375 0xfffd, 0xfffd, 0xfffd, 0xfffd)
376 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
377 0xfffd, 0xfffd, 0xfffd, 0xfffd)
378 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
379 0xfffd, 0xfffd, 0xfffd, 0xfffd)
380 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
381 0xfffd, 0xfffd, 0xfffd, 0xfffd)
382 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
383 0xfffd, 0xfffd, 0xfffd, 0xfffd)
384 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
385 0xfffd, 0xfffd, 0xfffd, 0xfffd)
386 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387 0xfffd, 0xfffd, 0xfffd, 0xfffd),
388 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
389 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
390 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
391 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000392
393 //
394 // Lonely start bytes
395 //
396
397 // Start bytes of 2-byte sequences (0xc0--0xdf).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
399 ConvertUTFResultContainer(sourceIllegal)
400 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401 0xfffd, 0xfffd, 0xfffd, 0xfffd)
402 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
403 0xfffd, 0xfffd, 0xfffd, 0xfffd)
404 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
405 0xfffd, 0xfffd, 0xfffd, 0xfffd)
406 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
407 0xfffd, 0xfffd, 0xfffd, 0xfffd),
408 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
409 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000410
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal)
413 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
414 0xfffd, 0x0020, 0xfffd, 0x0020)
415 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
416 0xfffd, 0x0020, 0xfffd, 0x0020)
417 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
418 0xfffd, 0x0020, 0xfffd, 0x0020)
419 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
420 0xfffd, 0x0020, 0xfffd, 0x0020)
421 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
422 0xfffd, 0x0020, 0xfffd, 0x0020)
423 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
424 0xfffd, 0x0020, 0xfffd, 0x0020)
425 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
426 0xfffd, 0x0020, 0xfffd, 0x0020)
427 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428 0xfffd, 0x0020, 0xfffd, 0x0020),
429 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
430 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
431 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
432 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000433
434 // Start bytes of 3-byte sequences (0xe0--0xef).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000435 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436 ConvertUTFResultContainer(sourceIllegal)
437 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
438 0xfffd, 0xfffd, 0xfffd, 0xfffd)
439 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
440 0xfffd, 0xfffd, 0xfffd, 0xfffd),
441 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000442
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
444 ConvertUTFResultContainer(sourceIllegal)
445 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
446 0xfffd, 0x0020, 0xfffd, 0x0020)
447 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
448 0xfffd, 0x0020, 0xfffd, 0x0020)
449 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
450 0xfffd, 0x0020, 0xfffd, 0x0020)
451 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
452 0xfffd, 0x0020, 0xfffd, 0x0020),
453 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
454 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000455
456 // Start bytes of 4-byte sequences (0xf0--0xf7).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000457 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458 ConvertUTFResultContainer(sourceIllegal)
459 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
460 0xfffd, 0xfffd, 0xfffd, 0xfffd),
461 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000462
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
464 ConvertUTFResultContainer(sourceIllegal)
465 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466 0xfffd, 0x0020, 0xfffd, 0x0020)
467 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468 0xfffd, 0x0020, 0xfffd, 0x0020),
469 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000470
471 // Start bytes of 5-byte sequences (0xf8--0xfb).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000472 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
473 ConvertUTFResultContainer(sourceIllegal)
474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
475 "\xf8\xf9\xfa\xfb"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000476
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000477 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478 ConvertUTFResultContainer(sourceIllegal)
479 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480 0xfffd, 0x0020, 0xfffd, 0x0020),
481 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000482
483 // Start bytes of 6-byte sequences (0xfc--0xfd).
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000484 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
485 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
486 "\xfc\xfd"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000487
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
489 ConvertUTFResultContainer(sourceIllegal)
490 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
491 "\xfc\x20\xfd\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000492
493 //
494 // Other bytes (0xc0--0xc1, 0xfe--0xff).
495 //
496
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000497 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
498 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
499 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
500 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
502 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
504 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000505
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507 ConvertUTFResultContainer(sourceIllegal)
508 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
509 "\xc0\xc1\xfe\xff"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000510
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512 ConvertUTFResultContainer(sourceIllegal)
513 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
514 "\xfe\xfe\xff\xff"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000515
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
517 ConvertUTFResultContainer(sourceIllegal)
518 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519 "\xfe\x80\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000520
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
522 ConvertUTFResultContainer(sourceIllegal)
523 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
524 "\xff\x80\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000525
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
527 ConvertUTFResultContainer(sourceIllegal)
528 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
529 0xfffd, 0x0020, 0xfffd, 0x0020),
530 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000531
532 //
533 // Sequences with one continuation byte missing
534 //
535
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
539 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
540 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
542 "\xe0\xa0"));
543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
545 "\xe0\xbf"));
546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
548 "\xe1\x80"));
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
551 "\xec\xbf"));
552 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
554 "\xed\x80"));
555 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
557 "\xed\x9f"));
558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
560 "\xee\x80"));
561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
562 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
563 "\xef\xbf"));
564 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
566 "\xf0\x90\x80"));
567 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
568 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
569 "\xf0\xbf\xbf"));
570 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
572 "\xf1\x80\x80"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
575 "\xf3\xbf\xbf"));
576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
577 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
578 "\xf4\x80\x80"));
579 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
580 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
581 "\xf4\x8f\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000582
583 // Overlong sequences with one trailing byte missing.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000584 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586 "\xc0"));
587 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589 "\xc1"));
590 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
592 "\xe0\x80"));
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
595 "\xe0\x9f"));
596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597 ConvertUTFResultContainer(sourceIllegal)
598 .withScalars(0xfffd, 0xfffd, 0xfffd),
599 "\xf0\x80\x80"));
600 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601 ConvertUTFResultContainer(sourceIllegal)
602 .withScalars(0xfffd, 0xfffd, 0xfffd),
603 "\xf0\x8f\x80"));
604 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605 ConvertUTFResultContainer(sourceIllegal)
606 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
607 "\xf8\x80\x80\x80"));
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal)
610 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
611 "\xfc\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000612
613 // Sequences that represent surrogates with one trailing byte missing.
614 // High surrogates
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
617 "\xed\xa0"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
620 "\xed\xac"));
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
623 "\xed\xaf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000624 // Low surrogates
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000625 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
626 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
627 "\xed\xb0"));
628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
629 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
630 "\xed\xb4"));
631 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
633 "\xed\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000634
635 // Ill-formed 4-byte sequences.
636 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
637 // U+1100xx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639 ConvertUTFResultContainer(sourceIllegal)
640 .withScalars(0xfffd, 0xfffd, 0xfffd),
641 "\xf4\x90\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000642 // U+13FBxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644 ConvertUTFResultContainer(sourceIllegal)
645 .withScalars(0xfffd, 0xfffd, 0xfffd),
646 "\xf4\xbf\xbf"));
647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648 ConvertUTFResultContainer(sourceIllegal)
649 .withScalars(0xfffd, 0xfffd, 0xfffd),
650 "\xf5\x80\x80"));
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal)
653 .withScalars(0xfffd, 0xfffd, 0xfffd),
654 "\xf6\x80\x80"));
655 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656 ConvertUTFResultContainer(sourceIllegal)
657 .withScalars(0xfffd, 0xfffd, 0xfffd),
658 "\xf7\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000659 // U+1FFBxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000660 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
661 ConvertUTFResultContainer(sourceIllegal)
662 .withScalars(0xfffd, 0xfffd, 0xfffd),
663 "\xf7\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000664
665 // Ill-formed 5-byte sequences.
666 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
667 // U+2000xx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal)
670 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
671 "\xf8\x88\x80\x80"));
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal)
674 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
675 "\xf8\xbf\xbf\xbf"));
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal)
678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679 "\xf9\x80\x80\x80"));
680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681 ConvertUTFResultContainer(sourceIllegal)
682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
683 "\xfa\x80\x80\x80"));
684 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
685 ConvertUTFResultContainer(sourceIllegal)
686 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
687 "\xfb\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000688 // U+3FFFFxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690 ConvertUTFResultContainer(sourceIllegal)
691 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
692 "\xfb\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000693
694 // Ill-formed 6-byte sequences.
695 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
696 // U+40000xx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698 ConvertUTFResultContainer(sourceIllegal)
699 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
700 "\xfc\x84\x80\x80\x80"));
701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
702 ConvertUTFResultContainer(sourceIllegal)
703 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
704 "\xfc\xbf\xbf\xbf\xbf"));
705 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
706 ConvertUTFResultContainer(sourceIllegal)
707 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
708 "\xfd\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000709 // U+7FFFFFxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711 ConvertUTFResultContainer(sourceIllegal)
712 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
713 "\xfd\xbf\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000714
715 //
716 // Sequences with two continuation bytes missing
717 //
718
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
721 "\xf0\x90"));
722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
724 "\xf0\xbf"));
725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
726 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
727 "\xf1\x80"));
728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
730 "\xf3\xbf"));
731 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
733 "\xf4\x80"));
734 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
735 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
736 "\xf4\x8f"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000737
738 // Overlong sequences with two trailing bytes missing.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000739 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
741 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
742 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
743 "\xf0\x80"));
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
746 "\xf0\x8f"));
747 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748 ConvertUTFResultContainer(sourceIllegal)
749 .withScalars(0xfffd, 0xfffd, 0xfffd),
750 "\xf8\x80\x80"));
751 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752 ConvertUTFResultContainer(sourceIllegal)
753 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754 "\xfc\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000755
756 // Sequences that represent surrogates with two trailing bytes missing.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000757 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000759
760 // Ill-formed 4-byte sequences.
761 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
762 // U+110yxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000763 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
764 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
765 "\xf4\x90"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000766 // U+13Fyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000767 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
769 "\xf4\xbf"));
770 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
772 "\xf5\x80"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
775 "\xf6\x80"));
776 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
778 "\xf7\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000779 // U+1FFyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000780 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
781 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
782 "\xf7\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000783
784 // Ill-formed 5-byte sequences.
785 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
786 // U+200yxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000787 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
789 "\xf8\x88\x80"));
790 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
792 "\xf8\xbf\xbf"));
793 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
795 "\xf9\x80\x80"));
796 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
798 "\xfa\x80\x80"));
799 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
801 "\xfb\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000802 // U+3FFFyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
805 "\xfb\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000806
807 // Ill-formed 6-byte sequences.
808 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
809 // U+4000yxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000810 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
812 "\xfc\x84\x80\x80"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
815 "\xfc\xbf\xbf\xbf"));
816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
818 "\xfd\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000819 // U+7FFFFyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000820 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
822 "\xfd\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000823
824 //
825 // Sequences with three continuation bytes missing
826 //
827
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000828 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
830 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
832 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
833 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
834 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
836 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
837 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000838
839 // Broken overlong sequences.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000840 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844 "\xf8\x80"));
845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
847 "\xfc\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000848
849 // Ill-formed 4-byte sequences.
850 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
851 // U+14yyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
854 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000856 // U+1Cyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000857 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000859
860 // Ill-formed 5-byte sequences.
861 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
862 // U+20yyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000863 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
865 "\xf8\x88"));
866 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
868 "\xf8\xbf"));
869 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
870 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
871 "\xf9\x80"));
872 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
873 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
874 "\xfa\x80"));
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
877 "\xfb\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000878 // U+3FCyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000879 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
881 "\xfb\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000882
883 // Ill-formed 6-byte sequences.
884 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
885 // U+400yyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000886 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
888 "\xfc\x84\x80"));
889 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
891 "\xfc\xbf\xbf"));
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894 "\xfd\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000895 // U+7FFCyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000896 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
897 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
898 "\xfd\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000899
900 //
901 // Sequences with four continuation bytes missing
902 //
903
904 // Ill-formed 5-byte sequences.
905 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
906 // U+uzyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000907 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
909 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
910 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
911 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
913 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000915 // U+3zyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000916 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000918
919 // Broken overlong sequences.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000920 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
922 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924 "\xfc\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000925
926 // Ill-formed 6-byte sequences.
927 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
928 // U+uzzyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
931 "\xfc\x84"));
932 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
933 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
934 "\xfc\xbf"));
935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937 "\xfd\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000938 // U+7Fzzyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000939 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
941 "\xfd\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000942
943 //
944 // Sequences with five continuation bytes missing
945 //
946
947 // Ill-formed 6-byte sequences.
948 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
949 // U+uzzyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000950 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000952 // U+uuzzyyxx (invalid)
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000953 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000955
956 //
957 // Consecutive sequences with trailing bytes missing
958 //
959
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000960 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961 ConvertUTFResultContainer(sourceIllegal)
962 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
963 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
964 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
965 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
966 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
967 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
968 "\xc0" "\xe0\x80" "\xf0\x80\x80"
969 "\xf8\x80\x80\x80"
970 "\xfc\x80\x80\x80\x80"
971 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
972 "\xfb\xbf\xbf\xbf"
973 "\xfd\xbf\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000974
975 //
976 // Overlong UTF-8 sequences
977 //
978
979 // U+002F SOLIDUS
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000980 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +0000982
983 // Overlong sequences of the above.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +0000984 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
986 "\xc0\xaf"));
987 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988 ConvertUTFResultContainer(sourceIllegal)
989 .withScalars(0xfffd, 0xfffd, 0xfffd),
990 "\xe0\x80\xaf"));
991 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992 ConvertUTFResultContainer(sourceIllegal)
993 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
994 "\xf0\x80\x80\xaf"));
995 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
996 ConvertUTFResultContainer(sourceIllegal)
997 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
998 "\xf8\x80\x80\x80\xaf"));
999 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1000 ConvertUTFResultContainer(sourceIllegal)
1001 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1002 "\xfc\x80\x80\x80\x80\xaf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001003
1004 // U+0000 NULL
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001005 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1007 StringRef("\x00", 1)));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001008
1009 // Overlong sequences of the above.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001010 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012 "\xc0\x80"));
1013 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014 ConvertUTFResultContainer(sourceIllegal)
1015 .withScalars(0xfffd, 0xfffd, 0xfffd),
1016 "\xe0\x80\x80"));
1017 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1018 ConvertUTFResultContainer(sourceIllegal)
1019 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1020 "\xf0\x80\x80\x80"));
1021 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022 ConvertUTFResultContainer(sourceIllegal)
1023 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1024 "\xf8\x80\x80\x80\x80"));
1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026 ConvertUTFResultContainer(sourceIllegal)
1027 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1028 "\xfc\x80\x80\x80\x80\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001029
1030 // Other overlong sequences.
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001031 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033 "\xc0\xbf"));
1034 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1036 "\xc1\x80"));
1037 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1038 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1039 "\xc1\xbf"));
1040 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1041 ConvertUTFResultContainer(sourceIllegal)
1042 .withScalars(0xfffd, 0xfffd, 0xfffd),
1043 "\xe0\x9f\xbf"));
1044 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1045 ConvertUTFResultContainer(sourceIllegal)
1046 .withScalars(0xfffd, 0xfffd, 0xfffd),
1047 "\xed\xa0\x80"));
1048 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049 ConvertUTFResultContainer(sourceIllegal)
1050 .withScalars(0xfffd, 0xfffd, 0xfffd),
1051 "\xed\xbf\xbf"));
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(sourceIllegal)
1054 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1055 "\xf0\x8f\x80\x80"));
1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057 ConvertUTFResultContainer(sourceIllegal)
1058 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1059 "\xf0\x8f\xbf\xbf"));
1060 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061 ConvertUTFResultContainer(sourceIllegal)
1062 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1063 "\xf8\x87\xbf\xbf\xbf"));
1064 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065 ConvertUTFResultContainer(sourceIllegal)
1066 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067 "\xfc\x83\xbf\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001068
1069 //
1070 // Isolated surrogates
1071 //
1072
1073 // Unicode 6.3.0:
1074 //
1075 // D71. High-surrogate code point: A Unicode code point in the range
1076 // U+D800 to U+DBFF.
1077 //
1078 // D73. Low-surrogate code point: A Unicode code point in the range
1079 // U+DC00 to U+DFFF.
1080
1081 // Note: U+E0100 is <DB40 DD00> in UTF16.
1082
1083 // High surrogates
1084
1085 // U+D800
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001086 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1087 ConvertUTFResultContainer(sourceIllegal)
1088 .withScalars(0xfffd, 0xfffd, 0xfffd),
1089 "\xed\xa0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001090
1091 // U+DB40
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001092 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093 ConvertUTFResultContainer(sourceIllegal)
1094 .withScalars(0xfffd, 0xfffd, 0xfffd),
1095 "\xed\xac\xa0"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001096
1097 // U+DBFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001098 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1099 ConvertUTFResultContainer(sourceIllegal)
1100 .withScalars(0xfffd, 0xfffd, 0xfffd),
1101 "\xed\xaf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001102
1103 // Low surrogates
1104
1105 // U+DC00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal)
1108 .withScalars(0xfffd, 0xfffd, 0xfffd),
1109 "\xed\xb0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001110
1111 // U+DD00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1115 "\xed\xb4\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001116
1117 // U+DFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001118 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1119 ConvertUTFResultContainer(sourceIllegal)
1120 .withScalars(0xfffd, 0xfffd, 0xfffd),
1121 "\xed\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001122
1123 // Surrogate pairs
1124
1125 // U+D800 U+DC00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001126 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127 ConvertUTFResultContainer(sourceIllegal)
1128 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1129 "\xed\xa0\x80\xed\xb0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001130
1131 // U+D800 U+DD00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135 "\xed\xa0\x80\xed\xb4\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001136
1137 // U+D800 U+DFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001138 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1139 ConvertUTFResultContainer(sourceIllegal)
1140 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1141 "\xed\xa0\x80\xed\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001142
1143 // U+DB40 U+DC00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001144 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1145 ConvertUTFResultContainer(sourceIllegal)
1146 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1147 "\xed\xac\xa0\xed\xb0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001148
1149 // U+DB40 U+DD00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001150 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1151 ConvertUTFResultContainer(sourceIllegal)
1152 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1153 "\xed\xac\xa0\xed\xb4\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001154
1155 // U+DB40 U+DFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001156 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1157 ConvertUTFResultContainer(sourceIllegal)
1158 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1159 "\xed\xac\xa0\xed\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001160
1161 // U+DBFF U+DC00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001162 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1163 ConvertUTFResultContainer(sourceIllegal)
1164 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1165 "\xed\xaf\xbf\xed\xb0\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001166
1167 // U+DBFF U+DD00
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001168 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1169 ConvertUTFResultContainer(sourceIllegal)
1170 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1171 "\xed\xaf\xbf\xed\xb4\x80"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001172
1173 // U+DBFF U+DFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001174 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175 ConvertUTFResultContainer(sourceIllegal)
1176 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177 "\xed\xaf\xbf\xed\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001178
1179 //
1180 // Noncharacters
1181 //
1182
1183 // Unicode 6.3.0:
1184 //
1185 // D14. Noncharacter: A code point that is permanently reserved for
1186 // internal use and that should never be interchanged. Noncharacters
1187 // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1188 // and the values U+FDD0..U+FDEF.
1189
1190 // U+FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001191 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1193 "\xef\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001194
1195 // U+FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001196 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1197 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1198 "\xef\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001199
1200 // U+1FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001201 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1202 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1203 "\xf0\x9f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001204
1205 // U+1FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001206 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1207 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1208 "\xf0\x9f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001209
1210 // U+2FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001211 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1212 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1213 "\xf0\xaf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001214
1215 // U+2FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1218 "\xf0\xaf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001219
1220 // U+3FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001221 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1223 "\xf0\xbf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001224
1225 // U+3FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001226 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1227 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1228 "\xf0\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001229
1230 // U+4FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001231 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1232 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1233 "\xf1\x8f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001234
1235 // U+4FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001236 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1237 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1238 "\xf1\x8f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001239
1240 // U+5FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001241 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1242 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1243 "\xf1\x9f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001244
1245 // U+5FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1248 "\xf1\x9f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001249
1250 // U+6FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001251 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1252 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1253 "\xf1\xaf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001254
1255 // U+6FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1257 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1258 "\xf1\xaf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001259
1260 // U+7FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001261 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1262 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1263 "\xf1\xbf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001264
1265 // U+7FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001266 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1267 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1268 "\xf1\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001269
1270 // U+8FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001271 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1272 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1273 "\xf2\x8f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001274
1275 // U+8FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001276 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1277 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1278 "\xf2\x8f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001279
1280 // U+9FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001281 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1282 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1283 "\xf2\x9f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001284
1285 // U+9FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001286 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1287 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1288 "\xf2\x9f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001289
1290 // U+AFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001291 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1292 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1293 "\xf2\xaf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001294
1295 // U+AFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001296 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1297 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1298 "\xf2\xaf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001299
1300 // U+BFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001301 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1302 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1303 "\xf2\xbf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001304
1305 // U+BFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001306 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1307 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1308 "\xf2\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001309
1310 // U+CFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001311 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1312 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1313 "\xf3\x8f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001314
1315 // U+CFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001316 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1317 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1318 "\xf3\x8f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001319
1320 // U+DFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001321 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1322 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1323 "\xf3\x9f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001324
1325 // U+DFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001326 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1327 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1328 "\xf3\x9f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001329
1330 // U+EFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001331 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1332 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1333 "\xf3\xaf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001334
1335 // U+EFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001336 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1337 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1338 "\xf3\xaf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001339
1340 // U+FFFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001341 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1342 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1343 "\xf3\xbf\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001344
1345 // U+FFFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001346 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1347 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1348 "\xf3\xbf\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001349
1350 // U+10FFFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001351 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1352 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1353 "\xf4\x8f\xbf\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001354
1355 // U+10FFFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001356 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1357 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1358 "\xf4\x8f\xbf\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001359
1360 // U+FDD0
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001361 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1362 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1363 "\xef\xb7\x90"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001364
1365 // U+FDD1
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001366 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1367 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1368 "\xef\xb7\x91"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001369
1370 // U+FDD2
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001371 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1372 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1373 "\xef\xb7\x92"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001374
1375 // U+FDD3
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001376 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1377 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1378 "\xef\xb7\x93"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001379
1380 // U+FDD4
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001381 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1382 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1383 "\xef\xb7\x94"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001384
1385 // U+FDD5
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001386 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1387 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1388 "\xef\xb7\x95"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001389
1390 // U+FDD6
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001391 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1392 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1393 "\xef\xb7\x96"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001394
1395 // U+FDD7
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001396 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1397 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1398 "\xef\xb7\x97"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001399
1400 // U+FDD8
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001401 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1402 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1403 "\xef\xb7\x98"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001404
1405 // U+FDD9
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001406 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1407 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1408 "\xef\xb7\x99"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001409
1410 // U+FDDA
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1412 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1413 "\xef\xb7\x9a"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001414
1415 // U+FDDB
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001416 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1417 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1418 "\xef\xb7\x9b"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001419
1420 // U+FDDC
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001421 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1422 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1423 "\xef\xb7\x9c"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001424
1425 // U+FDDD
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001426 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1427 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1428 "\xef\xb7\x9d"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001429
1430 // U+FDDE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001431 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1432 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1433 "\xef\xb7\x9e"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001434
1435 // U+FDDF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1437 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1438 "\xef\xb7\x9f"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001439
1440 // U+FDE0
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001441 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1442 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1443 "\xef\xb7\xa0"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001444
1445 // U+FDE1
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001446 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1447 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1448 "\xef\xb7\xa1"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001449
1450 // U+FDE2
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001451 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1452 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1453 "\xef\xb7\xa2"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001454
1455 // U+FDE3
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001456 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1457 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1458 "\xef\xb7\xa3"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001459
1460 // U+FDE4
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001461 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1462 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1463 "\xef\xb7\xa4"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001464
1465 // U+FDE5
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001466 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1467 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1468 "\xef\xb7\xa5"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001469
1470 // U+FDE6
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001471 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1472 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1473 "\xef\xb7\xa6"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001474
1475 // U+FDE7
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001476 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1477 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1478 "\xef\xb7\xa7"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001479
1480 // U+FDE8
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001481 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1482 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1483 "\xef\xb7\xa8"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001484
1485 // U+FDE9
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001486 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1487 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1488 "\xef\xb7\xa9"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001489
1490 // U+FDEA
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001491 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1492 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1493 "\xef\xb7\xaa"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001494
1495 // U+FDEB
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001496 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1497 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1498 "\xef\xb7\xab"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001499
1500 // U+FDEC
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001501 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1502 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1503 "\xef\xb7\xac"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001504
1505 // U+FDED
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001506 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1507 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1508 "\xef\xb7\xad"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001509
1510 // U+FDEE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001511 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1512 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1513 "\xef\xb7\xae"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001514
1515 // U+FDEF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001516 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1517 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1518 "\xef\xb7\xaf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001519
1520 // U+FDF0
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001521 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1522 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1523 "\xef\xb7\xb0"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001524
1525 // U+FDF1
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001526 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1527 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1528 "\xef\xb7\xb1"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001529
1530 // U+FDF2
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001531 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1532 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1533 "\xef\xb7\xb2"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001534
1535 // U+FDF3
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001536 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1537 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1538 "\xef\xb7\xb3"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001539
1540 // U+FDF4
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001541 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1542 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1543 "\xef\xb7\xb4"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001544
1545 // U+FDF5
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001546 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1547 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1548 "\xef\xb7\xb5"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001549
1550 // U+FDF6
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001551 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1552 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1553 "\xef\xb7\xb6"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001554
1555 // U+FDF7
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1557 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1558 "\xef\xb7\xb7"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001559
1560 // U+FDF8
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001561 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1562 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1563 "\xef\xb7\xb8"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001564
1565 // U+FDF9
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001566 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1567 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1568 "\xef\xb7\xb9"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001569
1570 // U+FDFA
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1572 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1573 "\xef\xb7\xba"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001574
1575 // U+FDFB
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001576 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1577 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1578 "\xef\xb7\xbb"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001579
1580 // U+FDFC
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001581 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1582 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1583 "\xef\xb7\xbc"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001584
1585 // U+FDFD
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001586 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1587 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1588 "\xef\xb7\xbd"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001589
1590 // U+FDFE
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001591 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1592 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1593 "\xef\xb7\xbe"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001594
1595 // U+FDFF
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001596 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1597 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1598 "\xef\xb7\xbf"));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001599}
1600
1601TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1602 // U+0041 LATIN CAPITAL LETTER A
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1605 "\x41", true));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001606
1607 //
1608 // Sequences with one continuation byte missing
1609 //
1610
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001611 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1612 ConvertUTFResultContainer(sourceExhausted),
1613 "\xc2", true));
1614 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615 ConvertUTFResultContainer(sourceExhausted),
1616 "\xdf", true));
1617 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618 ConvertUTFResultContainer(sourceExhausted),
1619 "\xe0\xa0", true));
1620 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1621 ConvertUTFResultContainer(sourceExhausted),
1622 "\xe0\xbf", true));
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(sourceExhausted),
1625 "\xe1\x80", true));
1626 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1627 ConvertUTFResultContainer(sourceExhausted),
1628 "\xec\xbf", true));
1629 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630 ConvertUTFResultContainer(sourceExhausted),
1631 "\xed\x80", true));
1632 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633 ConvertUTFResultContainer(sourceExhausted),
1634 "\xed\x9f", true));
1635 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1636 ConvertUTFResultContainer(sourceExhausted),
1637 "\xee\x80", true));
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(sourceExhausted),
1640 "\xef\xbf", true));
1641 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1642 ConvertUTFResultContainer(sourceExhausted),
1643 "\xf0\x90\x80", true));
1644 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645 ConvertUTFResultContainer(sourceExhausted),
1646 "\xf0\xbf\xbf", true));
1647 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648 ConvertUTFResultContainer(sourceExhausted),
1649 "\xf1\x80\x80", true));
1650 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651 ConvertUTFResultContainer(sourceExhausted),
1652 "\xf3\xbf\xbf", true));
1653 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654 ConvertUTFResultContainer(sourceExhausted),
1655 "\xf4\x80\x80", true));
1656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1657 ConvertUTFResultContainer(sourceExhausted),
1658 "\xf4\x8f\xbf", true));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001659
Dmitri Gribenkoebdd0a52014-06-17 09:33:24 +00001660 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1661 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1662 "\x41\xc2", true));
Dmitri Gribenko1089db02014-06-16 11:09:46 +00001663}
1664