Blame - ConvertUTF.c - platform/external/unicode

blob: 9e836fa9bc6412e70768fef55c6b0f8b44be4112 [file] [log] [blame]

Lucas Eckels	dc4699f	2012-08-06 15:22:01 -0700	[diff] [blame]	1	/*
				2	* Copyright 2001-2004 Unicode, Inc.
				3	*
				4	* Disclaimer
				5	*
				6	* This source code is provided as is by Unicode, Inc. No claims are
				7	* made as to fitness for any particular purpose. No warranties of any
				8	* kind are expressed or implied. The recipient agrees to determine
				9	* applicability of information provided. If this file has been
				10	* purchased on magnetic or optical media from Unicode, Inc., the
				11	* sole remedy for any claim will be exchange of defective media
				12	* within 90 days of receipt.
				13	*
				14	* Limitations on Rights to Redistribute This Code
				15	*
				16	* Unicode, Inc. hereby grants the right to freely use the information
				17	* supplied in this file in the creation of products supporting the
				18	* Unicode Standard, and to make copies of this file in any form
				19	* for internal or external distribution as long as this notice
				20	* remains attached.
				21	*/
				22
				23	/* ---------------------------------------------------------------------
				24
				25	Conversions between UTF32, UTF-16, and UTF-8. Source code file.
				26	Author: Mark E. Davis, 1994.
				27	Rev History: Rick McGowan, fixes & updates May 2001.
				28	Sept 2001: fixed const & error conditions per
				29	mods suggested by S. Parent & A. Lillich.
				30	June 2002: Tim Dodd added detection and handling of incomplete
				31	source sequences, enhanced error detection, added casts
				32	to eliminate compiler warnings.
				33	July 2003: slight mods to back out aggressive FFFE detection.
				34	Jan 2004: updated switches in from-UTF8 conversions.
				35	Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
				36
				37	See the header file "ConvertUTF.h" for complete documentation.
				38
				39	------------------------------------------------------------------------ */
				40
				41
				42	#include "ConvertUTF.h"
				43	#ifdef CVTUTF_DEBUG
				44	#include <stdio.h>
				45	#endif
				46
				47	static const int halfShift = 10; /* used for shifting by 10 bits */
				48
				49	static const UTF32 halfBase = 0x0010000UL;
				50	static const UTF32 halfMask = 0x3FFUL;
				51
				52	#define UNI_SUR_HIGH_START (UTF32)0xD800
				53	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				54	#define UNI_SUR_LOW_START (UTF32)0xDC00
				55	#define UNI_SUR_LOW_END (UTF32)0xDFFF
				56	#define false 0
				57	#define true 1
				58
				59	/* --------------------------------------------------------------------- */
				60
				61	ConversionResult ConvertUTF32toUTF16 (
				62	const UTF32** sourceStart, const UTF32* sourceEnd,
				63	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
				64	ConversionResult result = conversionOK;
				65	const UTF32* source = *sourceStart;
				66	UTF16* target = *targetStart;
				67	while (source < sourceEnd) {
				68	UTF32 ch;
				69	if (target >= targetEnd) {
				70	result = targetExhausted; break;
				71	}
				72	ch = *source++;
				73	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				74	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				75	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				76	if (flags == strictConversion) {
				77	--source; /* return to the illegal value itself */
				78	result = sourceIllegal;
				79	break;
				80	} else {
				81	*target++ = UNI_REPLACEMENT_CHAR;
				82	}
				83	} else {
				84	target++ = (UTF16)ch; / normal case */
				85	}
				86	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				87	if (flags == strictConversion) {
				88	result = sourceIllegal;
				89	} else {
				90	*target++ = UNI_REPLACEMENT_CHAR;
				91	}
				92	} else {
				93	/* target is a character in range 0xFFFF - 0x10FFFF. */
				94	if (target + 1 >= targetEnd) {
				95	--source; /* Back up source pointer! */
				96	result = targetExhausted; break;
				97	}
				98	ch -= halfBase;
				99	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				100	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				101	}
				102	}
				103	*sourceStart = source;
				104	*targetStart = target;
				105	return result;
				106	}
				107
				108	/* --------------------------------------------------------------------- */
				109
				110	ConversionResult ConvertUTF16toUTF32 (
				111	const UTF16** sourceStart, const UTF16* sourceEnd,
				112	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
				113	ConversionResult result = conversionOK;
				114	const UTF16* source = *sourceStart;
				115	UTF32* target = *targetStart;
				116	UTF32 ch, ch2;
				117	while (source < sourceEnd) {
				118	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				119	ch = *source++;
				120	/* If we have a surrogate pair, convert to UTF32 first. */
				121	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				122	/* If the 16 bits following the high surrogate are in the source buffer... */
				123	if (source < sourceEnd) {
				124	ch2 = *source;
				125	/* If it's a low surrogate, convert to UTF32. */
				126	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				127	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				128	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				129	++source;
				130	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				131	--source; /* return to the illegal value itself */
				132	result = sourceIllegal;
				133	break;
				134	}
				135	} else { /* We don't have the 16 bits following the high surrogate. */
				136	--source; /* return to the high surrogate */
				137	result = sourceExhausted;
				138	break;
				139	}
				140	} else if (flags == strictConversion) {
				141	/* UTF-16 surrogate values are illegal in UTF-32 */
				142	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				143	--source; /* return to the illegal value itself */
				144	result = sourceIllegal;
				145	break;
				146	}
				147	}
				148	if (target >= targetEnd) {
				149	source = oldSource; /* Back up source pointer! */
				150	result = targetExhausted; break;
				151	}
				152	*target++ = ch;
				153	}
				154	*sourceStart = source;
				155	*targetStart = target;
				156	#ifdef CVTUTF_DEBUG
				157	if (result == sourceIllegal) {
				158	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
				159	fflush(stderr);
				160	}
				161	#endif
				162	return result;
				163	}
				164
				165	/* --------------------------------------------------------------------- */
				166
				/*
				 * Index into the table below with the first byte of a UTF-8 sequence to
				 * get the number of trailing bytes that are supposed to follow it.
				 * Note that legal UTF-8 sequences can't have 4 or 5 trailing bytes. The
				 * table is left as-is for anyone who may want to do such conversion,
				 * which was allowed in earlier algorithms.
				 */
				static const char trailingBytesForUTF8[256] = {
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
				    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
				    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
				    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xFF */
				};
				184
				185	/*
				186	* Magic values subtracted from a buffer value during UTF8 conversion.
				187	* This table contains as many values as there might be trailing bytes
				188	* in a UTF-8 sequence.
				189	*/
				190	static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
				191	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
				192
				193	/*
				194	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				195	* into the first byte, depending on how many bytes follow. There are
				196	* as many entries in this table as there are UTF-8 sequence types.
				197	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
				198	* for legal UTF-8 will be 4 or fewer bytes total.
				199	*/
				200	static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				201
				202	/* --------------------------------------------------------------------- */
				203
				204	/* The interface converts a whole buffer to avoid function-call overhead.
				205	* Constants have been gathered. Loops & conditionals have been removed as
				206	* much as possible for efficiency, in favor of drop-through switches.
				207	* (See "Note A" at the bottom of the file for equivalent code.)
				208	* If your compiler supports it, the "isLegalUTF8" call can be turned
				209	* into an inline function.
				210	*/
				211
				212	/* --------------------------------------------------------------------- */
				213
				214	ConversionResult ConvertUTF16toUTF8 (
				215	const UTF16** sourceStart, const UTF16* sourceEnd,
				216	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
				217	ConversionResult result = conversionOK;
				218	const UTF16* source = *sourceStart;
				219	UTF8* target = *targetStart;
				220	while (source < sourceEnd) {
				221	UTF32 ch;
				222	unsigned short bytesToWrite = 0;
				223	const UTF32 byteMask = 0xBF;
				224	const UTF32 byteMark = 0x80;
				225	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				226	ch = *source++;
				227	/* If we have a surrogate pair, convert to UTF32 first. */
				228	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				229	/* If the 16 bits following the high surrogate are in the source buffer... */
				230	if (source < sourceEnd) {
				231	UTF32 ch2 = *source;
				232	/* If it's a low surrogate, convert to UTF32. */
				233	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				234	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				235	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				236	++source;
				237	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				238	--source; /* return to the illegal value itself */
				239	result = sourceIllegal;
				240	break;
				241	}
				242	} else { /* We don't have the 16 bits following the high surrogate. */
				243	--source; /* return to the high surrogate */
				244	result = sourceExhausted;
				245	break;
				246	}
				247	} else if (flags == strictConversion) {
				248	/* UTF-16 surrogate values are illegal in UTF-32 */
				249	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				250	--source; /* return to the illegal value itself */
				251	result = sourceIllegal;
				252	break;
				253	}
				254	}
				255
				256	// TPN: substitute all control characters except for NULL, TAB, LF or CR
				257	if (ch && (ch != (UTF32)0x09) && (ch != (UTF32)0x0a) && (ch != (UTF32)0x0d) && (ch < (UTF32)0x20) ) {
				258	ch = (UTF32)0x3f;
				259	}
				260	// TPN: filter out byte order marks and invalid character 0xFFFF
				261	if((ch == (UTF32)0xFEFF) \|\| (ch == (UTF32)0xFFFE)\|\| (ch == (UTF32)0xFFFF)) {
				262	continue;
				263	}
				264
				265	/* Figure out how many bytes the result will require */
				266	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				267	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				268	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				269	} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
				270	} else { bytesToWrite = 3;
				271	ch = UNI_REPLACEMENT_CHAR;
				272	}
				273
				274	target += bytesToWrite;
				275	if (target > targetEnd) {
				276	source = oldSource; /* Back up source pointer! */
				277	target -= bytesToWrite; result = targetExhausted; break;
				278	}
				279	switch (bytesToWrite) { /* note: everything falls through. */
				280	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				281	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				282	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				283	case 1: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
				284	}
				285	target += bytesToWrite;
				286	}
				287	*sourceStart = source;
				288	*targetStart = target;
				289	return result;
				290	}
				291
				292	/* --------------------------------------------------------------------- */
				293
				294	/*
				295	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				296	* This must be called with the length pre-determined by the first byte.
				297	* If not calling this from ConvertUTF8to*, then the length can be set by:
				298	* length = trailingBytesForUTF8[*source]+1;
				299	* and the sequence is illegal right away if there aren't that many bytes
				300	* available.
				301	* If presented with a length > 4, this returns false. The Unicode
				302	* definition of UTF-8 goes up to 4-byte sequences.
				303	*/
				304
				305	inline Boolean isLegalUTF8(const UTF8 *source, int length) {
				306	UTF8 a;
				307	const UTF8 *srcptr = source+length;
				308	switch (length) {
				309	default: return false;
				310	/* Everything else falls through when "true"... */
				311	case 4: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				312	case 3: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				313	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
				314
				315	switch (*source) {
				316	/* no fall-through in this inner switch */
				317	case 0xE0: if (a < 0xA0) return false; break;
				318	case 0xED: if (a > 0x9F) return false; break;
				319	case 0xF0: if (a < 0x90) return false; break;
				320	case 0xF4: if (a > 0x8F) return false; break;
				321	default: if (a < 0x80) return false;
				322	}
				323
				324	case 1: if (source >= 0x80 && source < 0xC2) return false;
				325	}
				326	if (*source > 0xF4) return false;
				327	return true;
				328	}
				329
				330	/* --------------------------------------------------------------------- */
				331
				332	/*
				333	* Exported function to return whether a UTF-8 sequence is legal or not.
				334	* This is not used here; it's just exported.
				335	*/
				336	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				337	int length = trailingBytesForUTF8[*source]+1;
				338	if (source+length > sourceEnd) {
				339	return false;
				340	}
				341	return isLegalUTF8(source, length);
				342	}
				343
				344	/* --------------------------------------------------------------------- */
				345
				346	ConversionResult ConvertUTF8toUTF16 (
				347	const UTF8** sourceStart, const UTF8* sourceEnd,
				348	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
				349	ConversionResult result = conversionOK;
				350	const UTF8* source = *sourceStart;
				351	UTF16* target = *targetStart;
				352	while (source < sourceEnd) {
				353	UTF32 ch = 0;
				354	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				355	if (source + extraBytesToRead >= sourceEnd) {
				356	result = sourceExhausted; break;
				357	}
				358	/* Do this check whether lenient or strict */
				359	if (! isLegalUTF8(source, extraBytesToRead+1)) {
				360	result = sourceIllegal;
				361	break;
				362	}
				363	/*
				364	* The cases all fall through. See "Note A" below.
				365	*/
				366	switch (extraBytesToRead) {
				367	case 5: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				368	case 4: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				369	case 3: ch += *source++; ch <<= 6;
				370	case 2: ch += *source++; ch <<= 6;
				371	case 1: ch += *source++; ch <<= 6;
				372	case 0: ch += *source++;
				373	}
				374	ch -= offsetsFromUTF8[extraBytesToRead];
				375
				376	if (target >= targetEnd) {
				377	source -= (extraBytesToRead+1); /* Back up source pointer! */
				378	result = targetExhausted; break;
				379	}
				380	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				381	/* UTF-16 surrogate values are illegal in UTF-32 */
				382	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				383	if (flags == strictConversion) {
				384	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				385	result = sourceIllegal;
				386	break;
				387	} else {
				388	*target++ = UNI_REPLACEMENT_CHAR;
				389	}
				390	} else {
				391	target++ = (UTF16)ch; / normal case */
				392	}
				393	} else if (ch > UNI_MAX_UTF16) {
				394	if (flags == strictConversion) {
				395	result = sourceIllegal;
				396	source -= (extraBytesToRead+1); /* return to the start */
				397	break; /* Bail out; shouldn't continue */
				398	} else {
				399	*target++ = UNI_REPLACEMENT_CHAR;
				400	}
				401	} else {
				402	/* target is a character in range 0xFFFF - 0x10FFFF. */
				403	if (target + 1 >= targetEnd) {
				404	source -= (extraBytesToRead+1); /* Back up source pointer! */
				405	result = targetExhausted; break;
				406	}
				407	ch -= halfBase;
				408	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				409	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				410	}
				411	}
				412	*sourceStart = source;
				413	*targetStart = target;
				414	return result;
				415	}
				416
				417	/* --------------------------------------------------------------------- */
				418
				419	ConversionResult ConvertUTF32toUTF8 (
				420	const UTF32** sourceStart, const UTF32* sourceEnd,
				421	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
				422	ConversionResult result = conversionOK;
				423	const UTF32* source = *sourceStart;
				424	UTF8* target = *targetStart;
				425	while (source < sourceEnd) {
				426	UTF32 ch;
				427	unsigned short bytesToWrite = 0;
				428	const UTF32 byteMask = 0xBF;
				429	const UTF32 byteMark = 0x80;
				430	ch = *source++;
				431	if (flags == strictConversion ) {
				432	/* UTF-16 surrogate values are illegal in UTF-32 */
				433	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				434	--source; /* return to the illegal value itself */
				435	result = sourceIllegal;
				436	break;
				437	}
				438	}
				439	/*
				440	* Figure out how many bytes the result will require. Turn any
				441	* illegally large UTF32 things (> Plane 17) into replacement chars.
				442	*/
				443	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				444	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				445	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				446	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
				447	} else { bytesToWrite = 3;
				448	ch = UNI_REPLACEMENT_CHAR;
				449	result = sourceIllegal;
				450	}
				451
				452	target += bytesToWrite;
				453	if (target > targetEnd) {
				454	--source; /* Back up source pointer! */
				455	target -= bytesToWrite; result = targetExhausted; break;
				456	}
				457	switch (bytesToWrite) { /* note: everything falls through. */
				458	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				459	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				460	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				461	case 1: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				462	}
				463	target += bytesToWrite;
				464	}
				465	*sourceStart = source;
				466	*targetStart = target;
				467	return result;
				468	}
				469
				470	/* --------------------------------------------------------------------- */
				471
				472	ConversionResult ConvertUTF8toUTF32 (
				473	const UTF8** sourceStart, const UTF8* sourceEnd,
				474	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
				475	ConversionResult result = conversionOK;
				476	const UTF8* source = *sourceStart;
				477	UTF32* target = *targetStart;
				478	while (source < sourceEnd) {
				479	UTF32 ch = 0;
				480	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				481	if (source + extraBytesToRead >= sourceEnd) {
				482	result = sourceExhausted; break;
				483	}
				484	/* Do this check whether lenient or strict */
				485	if (! isLegalUTF8(source, extraBytesToRead+1)) {
				486	result = sourceIllegal;
				487	break;
				488	}
				489	/*
				490	* The cases all fall through. See "Note A" below.
				491	*/
				492	switch (extraBytesToRead) {
				493	case 5: ch += *source++; ch <<= 6;
				494	case 4: ch += *source++; ch <<= 6;
				495	case 3: ch += *source++; ch <<= 6;
				496	case 2: ch += *source++; ch <<= 6;
				497	case 1: ch += *source++; ch <<= 6;
				498	case 0: ch += *source++;
				499	}
				500	ch -= offsetsFromUTF8[extraBytesToRead];
				501
				502	if (target >= targetEnd) {
				503	source -= (extraBytesToRead+1); /* Back up the source pointer! */
				504	result = targetExhausted; break;
				505	}
				506	if (ch <= UNI_MAX_LEGAL_UTF32) {
				507	/*
				508	* UTF-16 surrogate values are illegal in UTF-32, and anything
				509	* over Plane 17 (> 0x10FFFF) is illegal.
				510	*/
				511	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				512	if (flags == strictConversion) {
				513	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				514	result = sourceIllegal;
				515	break;
				516	} else {
				517	*target++ = UNI_REPLACEMENT_CHAR;
				518	}
				519	} else {
				520	*target++ = ch;
				521	}
				522	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				523	result = sourceIllegal;
				524	*target++ = UNI_REPLACEMENT_CHAR;
				525	}
				526	}
				527	*sourceStart = source;
				528	*targetStart = target;
				529	return result;
				530	}
				531
				532	/* ---------------------------------------------------------------------
				533
				534	Note A.
				535	The fall-through switches in UTF-8 reading code save a
				536	temp variable, some decrements & conditionals. The switches
				537	are equivalent to the following loop:
				538	{
				539	int tmpBytesToRead = extraBytesToRead+1;
				540	do {
				541	ch += *source++;
				542	--tmpBytesToRead;
				543	if (tmpBytesToRead) ch <<= 6;
				544	} while (tmpBytesToRead > 0);
				545	}
				546	In UTF-8 writing code, the switches on "bytesToWrite" are
				547	similarly unrolled loops.
				548
				549	--------------------------------------------------------------------- */