Add unicode source. Change-Id: I5c44414dd3abe387599358e1e10134c36d03438c

commit: dc4699f0a7c0f7f76ce37935c58ffd1f638a0525 [log] [tgz]
author: Lucas Eckels <eckels@google.com> Mon Aug 06 15:22:01 2012 -0700
committer: Lucas Eckels <eckels@google.com> Wed Aug 08 09:29:35 2012 -0700
tree: c960f038b36d6ff74661afca5cb9f21df6e3410e
parent: 7f35fcc49bcdd6769ef6df775ca911655f3192e1 [diff] [blame]
diff --git a/harness.c b/harness.c
new file mode 100644
index 0000000..c3b0b3a
--- /dev/null
+++ b/harness.c

@@ -0,0 +1,446 @@
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ * 
+ * Disclaimer
+ * 
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ * 
+ * Limitations on Rights to Redistribute This Code
+ * 
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ *
+ * harness.c
+ *
+ * This is a test harness for "ConvertUTF.c".  Compile this
+ * and run without arguments.  It will exhaustively test
+ * the conversion routines, and print a few lines of diagnostic
+ * output.  You don't need to compile ConvertUTF.c itself, 
+ * since it gets #included here along with the header.
+ * Example of a compile line:
+ *
+ *		$	gcc -g harness.c -o harness
+ *
+ * Rev History: Rick McGowan, new file April 2001.
+ * Sept 19, 2002: Corrected error on line 234:  utf16_buf[2] becomes utf16_result[2]
+ * 	per report from Iain Murray.
+ * July 3, 2003: Updated printout message.
+ * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
+ *	illegal surrogate use in UTF-8, per report from Frank Tang.
+ *
+ */
+
+#define CVTUTF_DEBUG 1
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ConvertUTF.c"
+
+/* ---------------------------------------------------------------------
+	test01 - Spot check a few legal & illegal UTF-8 values only.
+	This is not an exhaustive test, just a brief one that was
+	used to develop the "isLegalUTF8" routine.
+
+	Legal UTF-8 sequences are:
+
+	1st----	2nd----	3rd----	4th----	Codepoints---
+
+	00-7F				  0000-  007F
+	C2-DF	80-BF			  0080-  07FF
+	E0	A0-BF	80-BF		  0800-  0FFF
+	E1-EC   80-BF   80-BF             1000-  CFFF
+	ED      80-9F   80-BF             D000-  D7FF
+	EE-EF   80-BF   80-BF             E000-  FFFF
+	F0	90-BF	80-BF	80-BF	 10000- 3FFFF
+	F1-F3	80-BF	80-BF	80-BF	 40000- FFFFF
+	F4	80-8F	80-BF	80-BF	100000-10FFFF
+
+   --------------------------------------------------------------------- */
+
+
+/*
+ * One spot-check case for test01: the expected verdict, the length that is
+ * handed to isLegalUTF8, and the raw bytes to examine.  An entry whose
+ * utf8_len is 0 ends the test table.
+ */
+struct utf8_test {
+    Boolean utf8_legal;	/* expected result: non-zero if the sequence is legal */
+    int utf8_len;	/* length passed to isLegalUTF8; 0 marks the end of the table */
+    unsigned char utf8_seq[5];	/* the bytes under test, zero padded */
+};
+
+/*
+ * Test vectors consumed by test01.  The trailing comment on each row is its
+ * index in the table; test01 treats rows with an index above 21 specially
+ * (shortened end pointer, expected to be rejected).  The all-zero row at the
+ * end is the terminator.
+ */
+struct utf8_test utf8_testData[] = {
+    { 1,	1,	{ 0x7A, 0x00, 0x00, 0x00, 0x00 }},	/* 0 */
+    { 1,	2,	{ 0xC2, 0xAC, 0x00, 0x00, 0x00 }},	/* 1 */
+    { 1,	2,	{ 0xDF, 0xB2, 0x00, 0x00, 0x00 }},	/* 2 */
+    { 1,	3,	{ 0xE0, 0xA1, 0x81, 0x00, 0x00 }},	/* 3 */
+    { 1,	3,	{ 0xE1, 0xAC, 0x90, 0x00, 0x00 }},	/* 4 */
+    { 1,	3,	{ 0xF0, 0x93, 0xB2, 0xA1, 0x00 }},	/* 5 NOTE(review): utf8_len is 3 although 0xF0 leads a 4-byte form -- confirm intended */
+    { 1,	4,	{ 0xF1, 0x87, 0x9A, 0xB0, 0x00 }},	/* 6 */
+    { 1,	4,	{ 0xF3, 0x88, 0x9B, 0xAD, 0x00 }},	/* 7 */
+    { 1,	4,	{ 0xF4, 0x82, 0x89, 0x8F, 0x00 }},	/* 8 */
+
+    { 0,	3,	{ 0x82, 0x00, 0x00, 0x00, 0x00 }},	/* 9 */
+    { 0,	2,	{ 0xF8, 0xAC, 0x00, 0x00, 0x00 }},	/* 10 */
+    { 0,	2,	{ 0xE1, 0xFC, 0xFF, 0x00, 0x00 }},	/* 11 */
+    { 0,	3,	{ 0xC2, 0xFC, 0x00, 0x00, 0x00 }},	/* 12 */
+    { 0,	3,	{ 0xE1, 0xC2, 0x81, 0x00, 0x00 }},	/* 13 */
+    { 0,	2,	{ 0xC2, 0xC1, 0x00, 0x00, 0x00 }},	/* 14 */
+    { 0,	2,	{ 0xC0, 0xAF, 0x00, 0x00, 0x00 }},	/* 15 */
+    { 0,	3,	{ 0xE0, 0x9F, 0x80, 0x00, 0x00 }},	/* 16 */
+    { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC1, 0x00 }},	/* 17 */
+
+    { 1,	3,	{ 0xED, 0x9F, 0xBF, 0x00, 0x00 }},	/* 18 */
+    { 1,	3,	{ 0xEE, 0x80, 0x80, 0x00, 0x00 }},	/* 19 */
+    { 0,	3,	{ 0xED, 0xA0, 0x80, 0x00, 0x00 }},	/* 20 */
+    { 0,	3,	{ 0xED, 0xBF, 0xBF, 0x00, 0x00 }},	/* 21 */
+
+/* for all > 21 use "short" buffer lengths to detect over-run */
+    { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC3, 0x00 }},	/* 22 use short buflen */
+    { 0,	0,	{ 0x00, 0x00, 0x00, 0x00, 0x00 }},	/* terminator: utf8_len == 0 */
+
+};
+
+/*
+ * test01 - run every entry of utf8_testData through isLegalUTF8 and
+ * isLegalUTF8Sequence and compare against the expected verdicts.
+ *
+ * Each mismatch is reported on stdout.  Returns 1 if no entry mismatched,
+ * 0 otherwise.
+ */
+int test01() {
+	int idx;
+	int failures = 0;
+
+	printf("Begin Test01\n"); fflush(stdout);
+
+	for (idx = 0; utf8_testData[idx].utf8_len != 0; idx++) {
+		struct utf8_test *entry = &utf8_testData[idx];
+		int expectLegal = entry->utf8_legal;
+		int expectSeq = expectLegal;
+		int endOffset = 4;
+		int actualLegal, actualSeq;
+
+		actualLegal = isLegalUTF8(entry->utf8_seq, entry->utf8_len);
+
+		/* entries past index 21 get an end pointer one byte short and must be rejected */
+		if (idx > 21) {
+			endOffset = entry->utf8_len - 1;
+			expectSeq = 0;
+		}
+		actualSeq = isLegalUTF8Sequence(entry->utf8_seq, entry->utf8_seq + endOffset);
+
+		if (actualLegal == expectLegal && actualSeq == expectSeq) {
+			continue;
+		}
+
+		printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
+			idx, actualLegal, actualSeq, expectLegal, expectSeq,
+			entry->utf8_seq[0], entry->utf8_seq[1], entry->utf8_seq[2],
+			entry->utf8_seq[3], entry->utf8_seq[4],
+			entry->utf8_len);
+		failures++;
+	}
+
+	return failures == 0;
+}
+
+
+/* ---------------------------------------------------------------------
+	test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32
+
+	This is an exhaustive test of values 0 through 0x10FFFF.  It
+	takes each integer value and converts from UTF32 through the
+	other encoding forms, and back to UTF32, checking the results
+	along the way.
+
+	It does not check the un-paired low surrogates, except for
+	the first low surrogate.  It intends to get that one illegal
+	result, prints a message, and continues with tests.
+
+   --------------------------------------------------------------------- */
+
+/*
+ * test02 - round-trip every value 0..0x10FFFF through
+ * UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32 using strictConversion.
+ *
+ * Surrogate values are not round-tripped; UNI_SUR_HIGH_START is converted
+ * once and is expected to be reported as sourceIllegal.
+ *
+ * Returns 1 when every checked value survives the round trip, 0 after
+ * printing a diagnostic for the first unexpected result or mismatch.  A
+ * result code outside the four handled ConversionResult values ends the
+ * process with exit(1).
+ */
+int test02() {
+	int i, n;
+	ConversionResult result;
+	UTF32 utf32_buf[2], utf32_result[2];
+	UTF16 utf16_buf[3], utf16_result[3];
+	UTF8 utf8_buf[8];
+	UTF32 *utf32SourceStart, *utf32TargetStart;
+	UTF16 *utf16SourceStart, *utf16TargetStart;
+	UTF8 *utf8SourceStart, *utf8TargetStart;
+
+	printf("Begin Test02\n"); fflush(stdout);
+
+	for (i = 0; i <= 0x10FFFF; i++) {
+		/* Start every iteration from zeroed buffers so stale data cannot mask an error. */
+		utf32_buf[0] = i; utf32_buf[1] = 0;
+		utf32_result[0] = utf32_result[1] = 0;
+		utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0;
+		utf16_result[0] = utf16_result[1] = utf16_result[2] = 0;
+		for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
+		utf16TargetStart = utf16SourceStart = utf16_buf;
+		utf8TargetStart = utf8SourceStart = utf8_buf;
+
+		/*
+		 * Test UTF32 -> UTF16
+		 */
+		result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+		if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
+			/* result names are only printed for non-surrogate input */
+			switch (result) {
+			default: fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+			case conversionOK: break;
+			case sourceExhausted: printf("sourceExhausted\t"); break;
+			case targetExhausted: printf("targetExhausted\t"); break;
+			case sourceIllegal: printf("sourceIllegal\t"); break;
+			}
+		}
+		if (result != conversionOK) {
+			/* note "<=": 0xD800 itself is reported here, the other surrogates are not */
+			if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
+				printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n",
+				    i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result);
+				if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) {
+					return 0;
+				} else {
+					printf("!!! Test02A: note expected illegal result for 0x0000D800\n");
+				}
+			}
+		}
+		/* every surrogate except 0xD800 is done at this point */
+		if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
+
+		/*
+		 * Test UTF16 -> UTF8, with legality check on.  We check for everything except
+		 * for unpaired low surrogates.  We do make one check that the lowest low
+		 * surrogate, when unpaired, is illegal.
+		 */
+		result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
+		switch (result) {
+		default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+		case conversionOK: break;
+		case sourceExhausted: printf("sourceExhausted\t"); break;
+		case targetExhausted: printf("targetExhausted\t"); break;
+		case sourceIllegal: printf("sourceIllegal\t"); break;
+		}
+		if (result != conversionOK) {
+			printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
+				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
+			if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) {
+				return 0;
+			} else {
+				/* Note: This illegal result only happens if we remove the surrogate
+				    check in Test02A.  So it shouldn't be seen unless that check and
+				    the "continue" are removed in the test above.
+				*/
+				if (i == UNI_SUR_LOW_START)
+				    printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n");
+				else if (i == UNI_SUR_HIGH_START)
+				    printf("!!! Test02B: note expected illegal result for 0xD800,0000\n");
+			}
+		}
+		if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) {
+			printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
+				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
+			printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n");
+			return 0;
+		}
+
+		/* no surrogate takes part in the trip back */
+		if ((i >= UNI_SUR_HIGH_START) && (i <= UNI_SUR_LOW_END)) continue;
+
+		/*
+		 * Reset some result buffer pointers for the trip back.
+		 */
+		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
+		utf16TargetStart = utf16SourceStart = utf16_result;
+		utf8TargetStart = utf8SourceStart = utf8_buf;
+
+		/*
+		 * Test UTF8 -> UTF16, with legality check on.  The source end is derived
+		 * from the lead byte via trailingBytesForUTF8.
+		 */
+		result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion);
+		switch (result) {
+		default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+		case conversionOK: break;
+		case sourceExhausted: printf("sourceExhausted\t"); break;
+		case targetExhausted: printf("targetExhausted\t"); break;
+		case sourceIllegal: printf("sourceIllegal\t"); break;
+		}
+		if (result != conversionOK) {
+			/* this step writes into utf16_result, so that is the "output" to show */
+			printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n",
+				i, utf32_buf[0], utf8_buf, utf16_result[0], utf16_result[1], result);
+			return 0;
+		}
+		for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */
+			if (utf16_buf[n] != utf16_result[n]) {
+				printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n",
+					utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]);
+				return 0;
+			}
+		}
+
+		/*
+		 * Test UTF16 -> UTF32, with legality check on.  If the result of our previous
+		 * conversion gave us a "surrogate pair", then we need to convert 2 entities
+		 * back to UTF32.
+		 */
+		if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
+			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+		} else {
+			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+		}
+		switch (result) {
+		default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+		case conversionOK: break;
+		case sourceExhausted: printf("sourceExhausted\t"); break;
+		case targetExhausted: printf("targetExhausted\t"); break;
+		case sourceIllegal: printf("sourceIllegal\t"); break;
+		}
+		if (result != conversionOK) {
+			printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n",
+				i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result);
+			return 0;
+		}
+
+		/*
+		 * Now, check the final round-trip value.
+		 */
+		if (utf32_buf[0] != utf32_result[0]) {
+			printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* ---------------------------------------------------------------------
+	test03 - Test round trip UTF32 -> UTF8 -> UTF32
+
+	This tests the functions that were not tested by test02 above.
+	For each UTF32 value 0 through 0x10FFFF, it tests the conversion
+	to UTF-8 and back.  The test is exhaustive.
+
+   --------------------------------------------------------------------- */
+
+/*
+ * test03 - round-trip every value 0..0x10FFFF through UTF32 -> UTF8 -> UTF32
+ * using strictConversion.
+ *
+ * Surrogates are skipped, except that UNI_SUR_HIGH_START is converted once
+ * and must be reported as sourceIllegal.
+ *
+ * Returns 1 when every checked value survives the round trip, 0 after
+ * printing a diagnostic for the first unexpected result or mismatch.  A
+ * result code outside the four handled ConversionResult values ends the
+ * process with exit(1).
+ */
+int test03() {
+	int i, n;
+	ConversionResult result;
+	UTF32 utf32_buf[2], utf32_result[2];
+	UTF8 utf8_buf[8];
+	UTF32 *utf32SourceStart, *utf32TargetStart;
+	UTF8 *utf8SourceStart, *utf8TargetStart;
+
+	printf("Begin Test03\n"); fflush(stdout);
+
+	for (i = 0; i <= 0x10FFFF; i++) {
+		/* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */
+		if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
+
+		/* Start every iteration from zeroed buffers. */
+		utf32_buf[0] = i; utf32_buf[1] = 0;
+		utf32_result[0] = utf32_result[1] = 0;
+		for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
+		utf8TargetStart = utf8SourceStart = utf8_buf;
+
+		/*
+		 * Test UTF32 -> UTF8, with legality check on.
+		 */
+		result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+		switch (result) {
+		default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+		case conversionOK: break;
+		case sourceExhausted: printf("sourceExhausted\t"); break;
+		case targetExhausted: printf("targetExhausted\t"); break;
+		case sourceIllegal: printf("sourceIllegal\t"); break;
+		}
+		if (result != conversionOK) {
+			printf("Test03A for %d (0x%x); output %s; result %d\n",
+				i, utf32_buf[0], utf8_buf, result);
+			if (i != UNI_SUR_HIGH_START) {
+				return 0;
+			} else {
+				printf("!!! Test03A: note expected illegal result for 0x0000D800\n");
+			}
+		}
+		/* 0xD800 must be rejected specifically as sourceIllegal */
+		if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) {
+			printf("Test03A for %d (0x%x); output %s; result %d\n",
+				i, utf32_buf[0], utf8_buf, result);
+			printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n");
+			return 0;
+		}
+
+		/* no surrogate takes part in the trip back */
+		if ((i >= UNI_SUR_HIGH_START) && (i <= UNI_SUR_LOW_END)) continue;
+
+		/*
+		 * Reset some result buffer pointers for the trip back.
+		 */
+		utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
+		utf8TargetStart = utf8SourceStart = utf8_buf;
+
+		/*
+		 * Test UTF8 -> UTF32, with legality check on.  The source end is derived
+		 * from the lead byte via trailingBytesForUTF8.
+		 */
+		result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+		switch (result) {
+		default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+		case conversionOK: break;
+		case sourceExhausted: printf("sourceExhausted\t"); break;
+		case targetExhausted: printf("targetExhausted\t"); break;
+		case sourceIllegal: printf("sourceIllegal\t"); break;
+		}
+		if (result != conversionOK) {
+			printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n",
+				i, utf32_buf[0], utf8_buf, utf32_result[0], result);
+			return 0;
+		}
+
+		/*
+		 * Now, check the final round-trip value.
+		 */
+		if (utf32_buf[0] != utf32_result[0]) {
+			printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* ---------------------------------------------------------------------
+	test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8.
+	Expect it will be turned into UNI_REPLACEMENT_CHAR.
+
+   --------------------------------------------------------------------- */
+
+/*
+ * test04 - feed one UTF-32 value above 0x10FFFF to ConvertUTF32toUTF8 with
+ * strictConversion and require the result code sourceIllegal.
+ *
+ * Returns 1 on success.  Any other result code is reported on stderr and
+ * ends the process with exit(1), so this function never returns 0.
+ */
+int test04() {
+	int i, n;
+	ConversionResult result;
+	UTF32 utf32_buf[2];
+	UTF8 utf8_buf[8];
+	UTF32 *utf32SourceStart;
+	UTF8 *utf8TargetStart;
+
+	printf("Begin Test04\n"); fflush(stdout);
+
+	i = 0x10FFFF + 21; /* an arbitrary value > legal */
+
+	utf32_buf[0] = i; utf32_buf[1] = 0;
+	for (n = 0; n < 8; n++) utf8_buf[n] = 0;
+
+	utf32SourceStart = utf32_buf;
+	utf8TargetStart = utf8_buf;
+
+	/*
+	 * Test UTF32 -> UTF8, with legality check on.
+	 */
+	result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+	if (result != sourceIllegal) {
+		fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
+	}
+
+	return 1;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Run test01 through test04 in order, printing a success or failure banner
+ * for each.  Every test is run even if an earlier one failed.
+ *
+ * Returns EXIT_SUCCESS when all tests passed and EXIT_FAILURE when at least
+ * one reported failure, so scripts can rely on the exit status.
+ */
+int main() {
+	int failed = 0;
+
+	printf("Three tests of round-trip conversions will be performed.\n");
+	printf("One test of illegal UTF-32 will be performed.\n");
+	printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
+	printf("These are for tests of Surrogate conversion.\n\n");
+	fflush(stdout);
+	if (test01()) {	printf("******** Test01 succeeded without error. ********\n\n"); }
+	else { printf("-------- Test01 failed. --------\n\n"); failed = 1; }
+	if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); }
+	else { printf("-------- Test02 failed. --------\n\n"); failed = 1; }
+	if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
+	else { printf("-------- Test03 failed. --------\n\n"); failed = 1; }
+	if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
+	else { printf("-------- Test04 failed. --------\n\n"); failed = 1; }
+	return failed ? EXIT_FAILURE : EXIT_SUCCESS;
+}
commit	dc4699f0a7c0f7f76ce37935c58ffd1f638a0525	[log] [tgz]
author	Lucas Eckels <eckels@google.com>	Mon Aug 06 15:22:01 2012 -0700
committer	Lucas Eckels <eckels@google.com>	Wed Aug 08 09:29:35 2012 -0700
tree	c960f038b36d6ff74661afca5cb9f21df6e3410e
parent	7f35fcc49bcdd6769ef6df775ca911655f3192e1 [diff] [blame]