/* blob: c3b0b3ac3b9012770c8d79f56308ceb6909cecc7 [file] [log] [blame] */
/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*
* harness.c
*
* This is a test harness for "ConvertUTF.c". Compile this
* and run without arguments. It will exhaustively test
* the conversion routines, and print a few lines of diagnostic
* output. You don't need to compile ConvertUTF.c itself,
* since it gets #included here along with the header.
* Example of a compile line:
*
* $ gcc -g harness.c -o harness
*
* Rev History: Rick McGowan, new file April 2001.
* Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2]
* per report from Iain Murray.
* July 3, 2003: Updated printout message.
* Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
* illegal surrogate use in UTF-8, per report from Frank Tang.
*
*/
#define CVTUTF_DEBUG 1
#include <stdio.h>
#include <stdlib.h>
#include "ConvertUTF.c"
/* ---------------------------------------------------------------------
test01 - Spot check a few legal & illegal UTF-8 values only.
This is not an exhaustive test, just a brief one that was
used to develop the "isLegalUTF8" routine.
Legal UTF-8 sequences are:
1st---- 2nd---- 3rd---- 4th---- Codepoints---
00-7F 0000- 007F
C2-DF 80-BF 0080- 07FF
E0 A0-BF 80-BF 0800- 0FFF
E1-EC 80-BF 80-BF 1000- CFFF
ED 80-9F 80-BF D000- D7FF
EE-EF 80-BF 80-BF E000- FFFF
F0 90-BF 80-BF 80-BF 10000- 3FFFF
F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
F4 80-8F 80-BF 80-BF 100000-10FFFF
--------------------------------------------------------------------- */
/*
 * One spot-check case for the UTF-8 legality routines exercised by test01.
 * Instances are written as positional initializers, so the field order
 * below must not change.
 */
struct utf8_test {
Boolean utf8_legal; /* expected verdict: non-zero if the sequence should be reported legal */
int utf8_len; /* length handed to isLegalUTF8; 0 marks the end of the test table */
unsigned char utf8_seq[5]; /* the sequence bytes; unused trailing slots are 0x00 in the table */
};
/*
 * Table of UTF-8 spot checks consumed by test01.
 *
 * Columns: expected legality, length passed to isLegalUTF8, sequence bytes.
 * The number in the trailing comment is the entry's index; test01 treats
 * indices 0..21 differently from indices above 21 (see below), so entries
 * must not be reordered without updating that threshold.
 *
 * NOTE(review): the declared length does not always equal the number of
 * non-zero bytes listed (e.g. entries 5, 9, 11 and 12) - confirm this is
 * intentional before "correcting" any of them.
 *
 * The table is terminated by an entry whose length is 0.
 */
struct utf8_test utf8_testData[] = {
{ 1, 1, { 0x7A, 0x00, 0x00, 0x00, 0x00 }}, /* 0 */
{ 1, 2, { 0xC2, 0xAC, 0x00, 0x00, 0x00 }}, /* 1 */
{ 1, 2, { 0xDF, 0xB2, 0x00, 0x00, 0x00 }}, /* 2 */
{ 1, 3, { 0xE0, 0xA1, 0x81, 0x00, 0x00 }}, /* 3 */
{ 1, 3, { 0xE1, 0xAC, 0x90, 0x00, 0x00 }}, /* 4 */
{ 1, 3, { 0xF0, 0x93, 0xB2, 0xA1, 0x00 }}, /* 5 */
{ 1, 4, { 0xF1, 0x87, 0x9A, 0xB0, 0x00 }}, /* 6 */
{ 1, 4, { 0xF3, 0x88, 0x9B, 0xAD, 0x00 }}, /* 7 */
{ 1, 4, { 0xF4, 0x82, 0x89, 0x8F, 0x00 }}, /* 8 */
{ 0, 3, { 0x82, 0x00, 0x00, 0x00, 0x00 }}, /* 9 */
{ 0, 2, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */
{ 0, 2, { 0xE1, 0xFC, 0xFF, 0x00, 0x00 }}, /* 11 */
{ 0, 3, { 0xC2, 0xFC, 0x00, 0x00, 0x00 }}, /* 12 */
{ 0, 3, { 0xE1, 0xC2, 0x81, 0x00, 0x00 }}, /* 13 */
{ 0, 2, { 0xC2, 0xC1, 0x00, 0x00, 0x00 }}, /* 14 */
{ 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */
{ 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */
{ 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */
{ 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */
{ 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */
{ 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */
/* For all entries > 21, test01 passes a "short" buffer (length - 1) to
   isLegalUTF8Sequence to detect over-run, and expects an illegal verdict. */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 22 use short buflen */
{ 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, /* terminator: length 0 ends the table */
};
int test01() {
int i;
int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2;
printf("Begin Test01\n"); fflush(stdout);
rval = 0;
for (i = 0; utf8_testData[i].utf8_len; i++) {
wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
/* use truncated length for tests over 21 */
if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0],
utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2],
utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4],
utf8_testData[i].utf8_len);
++rval;
}
}
return (rval ? 0 : 1);
}
/* ---------------------------------------------------------------------
test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32
This is an exhaustive test of values 0 through 0x10FFFF. It
takes each integer value and converts from UTF-32 through the
other encoding forms, and back to UTF-32, checking the results
along the way.
It does not check the un-paired low surrogates, except for
the first low surrogate. It intends to get that one illegal
result, prints a message, and continues with tests.
--------------------------------------------------------------------- */
/*
 * Print the name of a non-OK conversion result on stdout (followed by a
 * tab, no newline). A result outside the four known values is treated as
 * fatal: a message tagged with `stage` goes to stderr and the process
 * exits with status 1.
 */
static void test02_report_result(const char *stage, ConversionResult result, UTF32 input) {
    switch (result) {
    case conversionOK: break;
    case sourceExhausted: printf("sourceExhausted\t"); break;
    case targetExhausted: printf("targetExhausted\t"); break;
    case sourceIllegal: printf("sourceIllegal\t"); break;
    default:
        /* Casts make the arguments match %d / %x whatever the underlying
           types of ConversionResult and UTF32 are. */
        fprintf(stderr, "%s fatal error: result %d for input %08x\n",
                stage, (int)result, (unsigned int)input);
        exit(1);
    }
}

/*
 * Round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32 for every value
 * 0 through 0x10FFFF, with strictConversion on every leg.
 *
 * Surrogate code points are not round-tripped. For UNI_SUR_HIGH_START the
 * first leg must report sourceIllegal (a note is printed); all other
 * surrogates are skipped right after the first leg.
 *
 * Returns 1 if every value survived the round trip, 0 on the first
 * failure (after printing a diagnostic). An unrecognised ConversionResult
 * terminates the process via exit(1).
 */
int test02() {
    int i, n;
    ConversionResult result;
    UTF32 utf32_buf[2], utf32_result[2];
    UTF16 utf16_buf[3], utf16_result[3];
    UTF8 utf8_buf[8];
    UTF32 *utf32SourceStart, *utf32TargetStart;
    UTF16 *utf16SourceStart, *utf16TargetStart;
    UTF8 *utf8SourceStart, *utf8TargetStart;

    printf("Begin Test02\n"); fflush(stdout);

    for (i = 0; i <= 0x10FFFF; i++) {
        /* Start every pass from zeroed buffers; utf8_buf is also printed
           with %s in diagnostics. */
        utf32_buf[0] = i; utf32_buf[1] = 0;
        utf32_result[0] = utf32_result[1] = 0;
        utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0;
        utf16_result[0] = utf16_result[1] = utf16_result[2] = 0;
        for (n = 0; n < 8; n++) utf8_buf[n] = 0;

        utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
        utf16TargetStart = utf16SourceStart = utf16_buf;
        utf8TargetStart = utf8SourceStart = utf8_buf;

        /*
         * Test UTF32 -> UTF16
         */
        result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
        if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
            /* The result name is only reported for non-surrogate input. */
            test02_report_result("Test02A", result, utf32_buf[0]);
        }
        if (result != conversionOK) {
            /* Failures are examined for non-surrogates and for 0xD800 only. */
            if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
                printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n",
                       i, (unsigned int)utf32_buf[0], utf16_buf[0], utf16_buf[1], (int)result);
                if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) {
                    return 0;
                } else {
                    printf("!!! Test02A: note expected illegal result for 0x0000D800\n");
                }
            }
        }
        /* Every surrogate other than 0xD800 stops here. */
        if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;

        /*
         * Test UTF16 -> UTF8, with legality check on. We check for everything except
         * for unpaired low surrogates. We do make one check that the lowest low
         * surrogate, when unpaired, is illegal.
         */
        result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
        test02_report_result("Test02B", result, utf32_buf[0]);
        if (result != conversionOK) {
            printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
                   i, (unsigned int)utf32_buf[0], utf16_buf[0], utf16_buf[1], (const char *)utf8_buf, (int)result);
            if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) {
                return 0;
            } else {
                /* Note: This illegal result only happens if we remove the surrogate
                   check in Test02A. So it shouldn't be seen unless that check and
                   the "continue" are removed in the test above.
                 */
                if (i == UNI_SUR_LOW_START)
                    printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n");
                else if (i == UNI_SUR_HIGH_START)
                    printf("!!! Test02B: note expected illegal result for 0xD800,0000\n");
            }
        }
        /* Not reached for 0xDC00 while the "continue" after Test02A is in
           place; kept so the check works if that skip is removed. */
        if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) {
            printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
                   i, (unsigned int)utf32_buf[0], utf16_buf[0], utf16_buf[1], (const char *)utf8_buf, (int)result);
            printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n");
            return 0;
        }
        /* Logical AND (the original used bitwise '&' on the two comparisons). */
        if ((i >= UNI_SUR_HIGH_START) && (i <= UNI_SUR_LOW_END)) continue;

        /*
         * Reset some result buffer pointers for the trip back.
         */
        utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
        utf16TargetStart = utf16SourceStart = utf16_result;
        utf8TargetStart = utf8SourceStart = utf8_buf;

        /*
         * Test UTF8 -> UTF16, with legality check on. The source end is
         * derived from the lead byte: trailingBytesForUTF8[lead] + 1 bytes.
         */
        result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion);
        test02_report_result("Test02C", result, utf32_buf[0]);
        if (result != conversionOK) {
            /* The output of this leg lives in utf16_result, not utf16_buf. */
            printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n",
                   i, (unsigned int)utf32_buf[0], (const char *)utf8_buf, utf16_result[0], utf16_result[1], (int)result);
            return 0;
        }
        for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */
            if (utf16_buf[n] != utf16_result[n]) {
                printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n",
                       (unsigned int)utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]);
                return 0;
            }
        }

        /*
         * Test UTF16 -> UTF32, with legality check on. If the result of our previous
         * conversion gave us a "surrogate pair", then we need to convert 2 entities
         * back to UTF32.
         */
        if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
            result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
        } else {
            result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
        }
        test02_report_result("Test02D", result, utf32_buf[0]);
        if (result != conversionOK) {
            printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n",
                   i, (unsigned int)utf32_buf[0], utf16_buf[0], utf16_buf[1], (unsigned int)utf32_result[0], (int)result);
            return 0;
        }

        /*
         * Now, check the final round-trip value.
         */
        if (utf32_buf[0] != utf32_result[0]) {
            printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n",
                   i, (unsigned int)utf32_buf[0], (unsigned int)utf32_result[0], utf16_buf[0], utf16_buf[1]);
            return 0;
        }
    }
    return 1;
}
/* ---------------------------------------------------------------------
test03 - Test round trip UTF32 -> UTF8 -> UTF32
This tests the functions that were not tested by test02 above.
For each UTF32 value 0 through 0x10FFFF, it tests the conversion
to UTF-8 and back. The test is exhaustive.
--------------------------------------------------------------------- */
/*
 * Print the name of a non-OK conversion result on stdout (followed by a
 * tab, no newline). A result outside the four known values is treated as
 * fatal: a message tagged with `stage` goes to stderr and the process
 * exits with status 1.
 */
static void test03_report_result(const char *stage, ConversionResult result, UTF32 input) {
    switch (result) {
    case conversionOK: break;
    case sourceExhausted: printf("sourceExhausted\t"); break;
    case targetExhausted: printf("targetExhausted\t"); break;
    case sourceIllegal: printf("sourceIllegal\t"); break;
    default:
        /* Casts make the arguments match %d / %x whatever the underlying
           types of ConversionResult and UTF32 are. */
        fprintf(stderr, "%s fatal error: result %d for input %08x\n",
                stage, (int)result, (unsigned int)input);
        exit(1);
    }
}

/*
 * Round trip UTF32 -> UTF8 -> UTF32 for every value 0 through 0x10FFFF,
 * with strictConversion on both legs.
 *
 * Surrogates are skipped, except UNI_SUR_HIGH_START: that value must make
 * the first leg report sourceIllegal (a note is printed) and is then not
 * converted back.
 *
 * Returns 1 if every value survived the round trip, 0 on the first
 * failure (after printing a diagnostic). An unrecognised ConversionResult
 * terminates the process via exit(1).
 */
int test03() {
    int i, n;
    ConversionResult result;
    UTF32 utf32_buf[2], utf32_result[2];
    UTF8 utf8_buf[8];
    UTF32 *utf32SourceStart, *utf32TargetStart;
    UTF8 *utf8SourceStart, *utf8TargetStart;

    printf("Begin Test03\n"); fflush(stdout);

    for (i = 0; i <= 0x10FFFF; i++) {
        /* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */
        if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;

        /* Start every pass from zeroed buffers; utf8_buf is also printed
           with %s in diagnostics. */
        utf32_buf[0] = i; utf32_buf[1] = 0;
        utf32_result[0] = utf32_result[1] = 0;
        for (n = 0; n < 8; n++) utf8_buf[n] = 0;

        utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
        utf8TargetStart = utf8SourceStart = utf8_buf;

        /*
         * Test UTF32 -> UTF8, with legality check on.
         */
        result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
        test03_report_result("Test03A", result, utf32_buf[0]);
        if (result != conversionOK) {
            printf("Test03A for %d (0x%x); output %s; result %d\n",
                   i, (unsigned int)utf32_buf[0], (const char *)utf8_buf, (int)result);
            if (i != UNI_SUR_HIGH_START) {
                return 0;
            } else {
                printf("!!! Test03A: note expected illegal result for 0x0000D800\n");
            }
        }
        if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) {
            printf("Test03A for %d (0x%x); output %s; result %d\n",
                   i, (unsigned int)utf32_buf[0], (const char *)utf8_buf, (int)result);
            printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n");
            return 0;
        }
        /* Logical AND (the original used bitwise '&' on the two comparisons).
           After the skip at the top of the loop only 0xD800 can match here. */
        if ((i >= UNI_SUR_HIGH_START) && (i <= UNI_SUR_LOW_END)) continue;

        /*
         * Reset some result buffer pointers for the trip back.
         */
        utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
        utf8TargetStart = utf8SourceStart = utf8_buf;

        /*
         * Test UTF8 -> UTF32, with legality check on. The source end is
         * derived from the lead byte: trailingBytesForUTF8[lead] + 1 bytes.
         */
        result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
        test03_report_result("Test03B", result, utf32_buf[0]);
        if (result != conversionOK) {
            printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n",
                   i, (unsigned int)utf32_buf[0], (const char *)utf8_buf, (unsigned int)utf32_result[0], (int)result);
            return 0;
        }

        /*
         * Now, check the final round-trip value.
         */
        if (utf32_buf[0] != utf32_result[0]) {
            printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n",
                   i, (unsigned int)utf32_buf[0], (const char *)utf8_buf, (unsigned int)utf32_result[0]);
            return 0;
        }
    }
    return 1;
}
/* ---------------------------------------------------------------------
test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8.
Expect it will be turned into UNI_REPLACEMENT_CHAR; the test itself
only verifies that the strict conversion reports sourceIllegal.
--------------------------------------------------------------------- */
/*
 * Convert a single UTF-32 value above 0x10FFFF to UTF-8 with
 * strictConversion and require the result to be sourceIllegal.
 *
 * Only the returned ConversionResult is checked; the bytes written to
 * utf8_buf are not inspected.
 *
 * Returns 1 on success. Any other result is treated as fatal: a message
 * is written to stderr and the process exits with status 1.
 */
int test04() {
    int i, n;
    ConversionResult result;
    UTF32 utf32_buf[2];
    UTF8 utf8_buf[8];
    UTF32 *utf32SourceStart;
    UTF8 *utf8TargetStart;

    printf("Begin Test04\n"); fflush(stdout);

    i = 0x10FFFF + 21; /* an arbitrary value > legal */
    utf32_buf[0] = i; utf32_buf[1] = 0;
    for (n = 0; n < 8; n++) utf8_buf[n] = 0;
    utf32SourceStart = utf32_buf;
    utf8TargetStart = utf8_buf;

    /*
     * Test UTF32 -> UTF8, with legality check on.
     */
    result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
    if (result != sourceIllegal) {
        /* Casts make the arguments match %d / %x whatever the underlying
           types of ConversionResult and UTF32 are. */
        fprintf(stderr, "Test04A fatal error: result %d for input %08x\n",
                (int)result, (unsigned int)utf32_buf[0]);
        exit(1);
    }
    return 1;
}
/* --------------------------------------------------------------------- */
/*
 * Run test01 through test04 in order, printing a success or failure
 * banner after each one.
 *
 * Returns 0 when every test reported success and 1 when at least one
 * test returned failure, so that scripts can detect a failed run from
 * the exit status.
 */
int main() {
    int failures = 0;

    printf("Three tests of round-trip conversions will be performed.\n");
    printf("One test of illegal UTF-32 will be performed.\n");
    printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
    printf("These are for tests of Surrogate conversion.\n\n");
    fflush(stdout);

    if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); }
    else { printf("-------- Test01 failed. --------\n\n"); failures++; }

    if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); }
    else { printf("-------- Test02 failed. --------\n\n"); failures++; }

    if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
    else { printf("-------- Test03 failed. --------\n\n"); failures++; }

    if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
    else { printf("-------- Test04 failed. --------\n\n"); failures++; }

    return failures ? 1 : 0;
}