Blame - lib/Basic/ConvertUTF.c - fp2-dev/platform/external/clang

blob: 124e386c5526507825bb97511a627209bc1bfb3b [file] [log] [blame]

Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	1	/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
				2	*
				3	* The LLVM Compiler Infrastructure
				4	*
				5	* This file is distributed under the University of Illinois Open Source
				6	* License. See LICENSE.TXT for details.
				7	*
				8	*===------------------------------------------------------------------------=*/
				9	/*
				10	* Copyright 2001-2004 Unicode, Inc.
				11	*
				12	* Disclaimer
				13	*
				14	* This source code is provided as is by Unicode, Inc. No claims are
				15	* made as to fitness for any particular purpose. No warranties of any
				16	* kind are expressed or implied. The recipient agrees to determine
				17	* applicability of information provided. If this file has been
				18	* purchased on magnetic or optical media from Unicode, Inc., the
				19	* sole remedy for any claim will be exchange of defective media
				20	* within 90 days of receipt.
				21	*
				22	* Limitations on Rights to Redistribute This Code
				23	*
				24	* Unicode, Inc. hereby grants the right to freely use the information
				25	* supplied in this file in the creation of products supporting the
				26	* Unicode Standard, and to make copies of this file in any form
				27	* for internal or external distribution as long as this notice
				28	* remains attached.
				29	*/
				30
				31	/* ---------------------------------------------------------------------
				32
				33	Conversions between UTF32, UTF-16, and UTF-8. Source code file.
				34	Author: Mark E. Davis, 1994.
				35	Rev History: Rick McGowan, fixes & updates May 2001.
				36	Sept 2001: fixed const & error conditions per
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	37	mods suggested by S. Parent & A. Lillich.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	38	June 2002: Tim Dodd added detection and handling of incomplete
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	39	source sequences, enhanced error detection, added casts
				40	to eliminate compiler warnings.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	41	July 2003: slight mods to back out aggressive FFFE detection.
				42	Jan 2004: updated switches in from-UTF8 conversions.
				43	Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
				44
				45	See the header file "ConvertUTF.h" for complete documentation.
				46
				47	------------------------------------------------------------------------ */
				48
				49
				50	#include "clang/Basic/ConvertUTF.h"
				51	#ifdef CVTUTF_DEBUG
				52	#include <stdio.h>
				53	#endif
				54
				55	static const int halfShift = 10; /* used for shifting by 10 bits */
				56
				57	static const UTF32 halfBase = 0x0010000UL;
				58	static const UTF32 halfMask = 0x3FFUL;
				59
				60	#define UNI_SUR_HIGH_START (UTF32)0xD800
				61	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				62	#define UNI_SUR_LOW_START (UTF32)0xDC00
				63	#define UNI_SUR_LOW_END (UTF32)0xDFFF
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	64	#define false 0
				65	#define true 1
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	66
				67	/* --------------------------------------------------------------------- */
				68
				69	/*
				70	* Index into the table below with the first byte of a UTF-8 sequence to
				71	* get the number of trailing bytes that are supposed to follow it.
				72	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
				73	* left as-is for anyone who may want to do such conversion, which was
				74	* allowed in earlier algorithms.
				75	*/
				76	static const char trailingBytesForUTF8[256] = {
				77	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				78	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				79	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				80	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				81	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				82	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				83	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
				84	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
				85	};
				86
				87	/*
				88	* Magic values subtracted from a buffer value during UTF8 conversion.
				89	* This table contains as many values as there might be trailing bytes
				90	* in a UTF-8 sequence.
				91	*/
				92	static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	93	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	94
				95	/*
				96	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				97	* into the first byte, depending on how many bytes follow. There are
				98	* as many entries in this table as there are UTF-8 sequence types.
				99	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
				100	* for legal UTF-8 will be 4 or fewer bytes total.
				101	*/
				102	static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				103
				104	/* --------------------------------------------------------------------- */
				105
				106	/* The interface converts a whole buffer to avoid function-call overhead.
				107	* Constants have been gathered. Loops & conditionals have been removed as
				108	* much as possible for efficiency, in favor of drop-through switches.
				109	* (See "Note A" at the bottom of the file for equivalent code.)
				110	* If your compiler supports it, the "isLegalUTF8" call can be turned
				111	* into an inline function.
				112	*/
				113
				114	#ifdef CLANG_NEEDS_THESE_ONE_DAY
				115
				116	/* --------------------------------------------------------------------- */
				117
				118	ConversionResult ConvertUTF32toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	119	const UTF32** sourceStart, const UTF32* sourceEnd,
				120	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	121	ConversionResult result = conversionOK;
				122	const UTF32* source = *sourceStart;
				123	UTF16* target = *targetStart;
				124	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	125	UTF32 ch;
				126	if (target >= targetEnd) {
				127	result = targetExhausted; break;
				128	}
				129	ch = *source++;
				130	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				131	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				132	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				133	if (flags == strictConversion) {
				134	--source; /* return to the illegal value itself */
				135	result = sourceIllegal;
				136	break;
				137	} else {
				138	*target++ = UNI_REPLACEMENT_CHAR;
				139	}
				140	} else {
				141	target++ = (UTF16)ch; / normal case */
				142	}
				143	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				144	if (flags == strictConversion) {
				145	result = sourceIllegal;
				146	} else {
				147	*target++ = UNI_REPLACEMENT_CHAR;
				148	}
				149	} else {
				150	/* target is a character in range 0xFFFF - 0x10FFFF. */
				151	if (target + 1 >= targetEnd) {
				152	--source; /* Back up source pointer! */
				153	result = targetExhausted; break;
				154	}
				155	ch -= halfBase;
				156	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				157	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				158	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	159	}
				160	*sourceStart = source;
				161	*targetStart = target;
				162	return result;
				163	}
				164
				165	/* --------------------------------------------------------------------- */
				166
				167	ConversionResult ConvertUTF16toUTF32 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	168	const UTF16** sourceStart, const UTF16* sourceEnd,
				169	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	170	ConversionResult result = conversionOK;
				171	const UTF16* source = *sourceStart;
				172	UTF32* target = *targetStart;
				173	UTF32 ch, ch2;
				174	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	175	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				176	ch = *source++;
				177	/* If we have a surrogate pair, convert to UTF32 first. */
				178	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				179	/* If the 16 bits following the high surrogate are in the source buffer... */
				180	if (source < sourceEnd) {
				181	ch2 = *source;
				182	/* If it's a low surrogate, convert to UTF32. */
				183	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				184	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				185	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				186	++source;
				187	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				188	--source; /* return to the illegal value itself */
				189	result = sourceIllegal;
				190	break;
				191	}
				192	} else { /* We don't have the 16 bits following the high surrogate. */
				193	--source; /* return to the high surrogate */
				194	result = sourceExhausted;
				195	break;
				196	}
				197	} else if (flags == strictConversion) {
				198	/* UTF-16 surrogate values are illegal in UTF-32 */
				199	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				200	--source; /* return to the illegal value itself */
				201	result = sourceIllegal;
				202	break;
				203	}
				204	}
				205	if (target >= targetEnd) {
				206	source = oldSource; /* Back up source pointer! */
				207	result = targetExhausted; break;
				208	}
				209	*target++ = ch;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	210	}
				211	*sourceStart = source;
				212	*targetStart = target;
				213	#ifdef CVTUTF_DEBUG
				214	if (result == sourceIllegal) {
				215	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
				216	fflush(stderr);
				217	}
				218	#endif
				219	return result;
				220	}
				221	ConversionResult ConvertUTF16toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	222	const UTF16** sourceStart, const UTF16* sourceEnd,
				223	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	224	ConversionResult result = conversionOK;
				225	const UTF16* source = *sourceStart;
				226	UTF8* target = *targetStart;
				227	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	228	UTF32 ch;
				229	unsigned short bytesToWrite = 0;
				230	const UTF32 byteMask = 0xBF;
				231	const UTF32 byteMark = 0x80;
				232	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				233	ch = *source++;
				234	/* If we have a surrogate pair, convert to UTF32 first. */
				235	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				236	/* If the 16 bits following the high surrogate are in the source buffer... */
				237	if (source < sourceEnd) {
				238	UTF32 ch2 = *source;
				239	/* If it's a low surrogate, convert to UTF32. */
				240	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				241	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				242	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				243	++source;
				244	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				245	--source; /* return to the illegal value itself */
				246	result = sourceIllegal;
				247	break;
				248	}
				249	} else { /* We don't have the 16 bits following the high surrogate. */
				250	--source; /* return to the high surrogate */
				251	result = sourceExhausted;
				252	break;
				253	}
				254	} else if (flags == strictConversion) {
				255	/* UTF-16 surrogate values are illegal in UTF-32 */
				256	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				257	--source; /* return to the illegal value itself */
				258	result = sourceIllegal;
				259	break;
				260	}
				261	}
				262	/* Figure out how many bytes the result will require */
				263	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				264	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				265	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				266	} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
				267	} else { bytesToWrite = 3;
				268	ch = UNI_REPLACEMENT_CHAR;
				269	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	270
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	271	target += bytesToWrite;
				272	if (target > targetEnd) {
				273	source = oldSource; /* Back up source pointer! */
				274	target -= bytesToWrite; result = targetExhausted; break;
				275	}
				276	switch (bytesToWrite) { /* note: everything falls through. */
				277	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				278	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				279	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				280	case 1: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
				281	}
				282	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	283	}
				284	*sourceStart = source;
				285	*targetStart = target;
				286	return result;
				287	}
				288
				289	/* --------------------------------------------------------------------- */
				290
				291	ConversionResult ConvertUTF32toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	292	const UTF32** sourceStart, const UTF32* sourceEnd,
				293	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	294	ConversionResult result = conversionOK;
				295	const UTF32* source = *sourceStart;
				296	UTF8* target = *targetStart;
				297	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	298	UTF32 ch;
				299	unsigned short bytesToWrite = 0;
				300	const UTF32 byteMask = 0xBF;
				301	const UTF32 byteMark = 0x80;
				302	ch = *source++;
				303	if (flags == strictConversion ) {
				304	/* UTF-16 surrogate values are illegal in UTF-32 */
				305	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				306	--source; /* return to the illegal value itself */
				307	result = sourceIllegal;
				308	break;
				309	}
				310	}
				311	/*
				312	* Figure out how many bytes the result will require. Turn any
				313	* illegally large UTF32 things (> Plane 17) into replacement chars.
				314	*/
				315	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				316	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				317	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				318	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
				319	} else { bytesToWrite = 3;
				320	ch = UNI_REPLACEMENT_CHAR;
				321	result = sourceIllegal;
				322	}
				323
				324	target += bytesToWrite;
				325	if (target > targetEnd) {
				326	--source; /* Back up source pointer! */
				327	target -= bytesToWrite; result = targetExhausted; break;
				328	}
				329	switch (bytesToWrite) { /* note: everything falls through. */
				330	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				331	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				332	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				333	case 1: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				334	}
				335	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	336	}
				337	*sourceStart = source;
				338	*targetStart = target;
				339	return result;
				340	}
				341
				342	/* --------------------------------------------------------------------- */
				343
				344	ConversionResult ConvertUTF8toUTF32 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	345	const UTF8** sourceStart, const UTF8* sourceEnd,
				346	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	347	ConversionResult result = conversionOK;
				348	const UTF8* source = *sourceStart;
				349	UTF32* target = *targetStart;
				350	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	351	UTF32 ch = 0;
				352	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				353	if (source + extraBytesToRead >= sourceEnd) {
				354	result = sourceExhausted; break;
				355	}
				356	/* Do this check whether lenient or strict */
				357	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				358	result = sourceIllegal;
				359	break;
				360	}
				361	/*
				362	* The cases all fall through. See "Note A" below.
				363	*/
				364	switch (extraBytesToRead) {
				365	case 5: ch += *source++; ch <<= 6;
				366	case 4: ch += *source++; ch <<= 6;
				367	case 3: ch += *source++; ch <<= 6;
				368	case 2: ch += *source++; ch <<= 6;
				369	case 1: ch += *source++; ch <<= 6;
				370	case 0: ch += *source++;
				371	}
				372	ch -= offsetsFromUTF8[extraBytesToRead];
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	373
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	374	if (target >= targetEnd) {
				375	source -= (extraBytesToRead+1); /* Back up the source pointer! */
				376	result = targetExhausted; break;
				377	}
				378	if (ch <= UNI_MAX_LEGAL_UTF32) {
				379	/*
				380	* UTF-16 surrogate values are illegal in UTF-32, and anything
				381	* over Plane 17 (> 0x10FFFF) is illegal.
				382	*/
				383	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				384	if (flags == strictConversion) {
				385	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				386	result = sourceIllegal;
				387	break;
				388	} else {
				389	*target++ = UNI_REPLACEMENT_CHAR;
				390	}
				391	} else {
				392	*target++ = ch;
				393	}
				394	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				395	result = sourceIllegal;
				396	*target++ = UNI_REPLACEMENT_CHAR;
				397	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	398	}
				399	*sourceStart = source;
				400	*targetStart = target;
				401	return result;
				402	}
				403	#endif
				404
				405	/* --------------------------------------------------------------------- */
				406
				407	/*
				408	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				409	* This must be called with the length pre-determined by the first byte.
				410	* If not calling this from ConvertUTF8to*, then the length can be set by:
				411	* length = trailingBytesForUTF8[*source]+1;
				412	* and the sequence is illegal right away if there aren't that many bytes
				413	* available.
				414	* If presented with a length > 4, this returns false. The Unicode
				415	* definition of UTF-8 goes up to 4-byte sequences.
				416	*/
				417
				418	static Boolean isLegalUTF8(const UTF8 *source, int length) {
				419	UTF8 a;
				420	const UTF8 *srcptr = source+length;
				421	switch (length) {
				422	default: return false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	423	/* Everything else falls through when "true"... */
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	424	case 4: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				425	case 3: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				426	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
				427
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	428	switch (*source) {
				429	/* no fall-through in this inner switch */
				430	case 0xE0: if (a < 0xA0) return false; break;
				431	case 0xED: if (a > 0x9F) return false; break;
				432	case 0xF0: if (a < 0x90) return false; break;
				433	case 0xF4: if (a > 0x8F) return false; break;
				434	default: if (a < 0x80) return false;
				435	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	436
				437	case 1: if (source >= 0x80 && source < 0xC2) return false;
				438	}
				439	if (*source > 0xF4) return false;
				440	return true;
				441	}
				442
				443	/* --------------------------------------------------------------------- */
				444
				445	/*
				446	* Exported function to return whether a UTF-8 sequence is legal or not.
				447	* This is not used here; it's just exported.
				448	*/
				449	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				450	int length = trailingBytesForUTF8[*source]+1;
				451	if (source+length > sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	452	return false;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	453	}
				454	return isLegalUTF8(source, length);
				455	}
				456
				457	/* --------------------------------------------------------------------- */
				458
				459	ConversionResult ConvertUTF8toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	460	const UTF8** sourceStart, const UTF8* sourceEnd,
				461	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	462	ConversionResult result = conversionOK;
				463	const UTF8* source = *sourceStart;
				464	UTF16* target = *targetStart;
				465	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	466	UTF32 ch = 0;
				467	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				468	if (source + extraBytesToRead >= sourceEnd) {
				469	result = sourceExhausted; break;
				470	}
				471	/* Do this check whether lenient or strict */
				472	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				473	result = sourceIllegal;
				474	break;
				475	}
				476	/*
				477	* The cases all fall through. See "Note A" below.
				478	*/
				479	switch (extraBytesToRead) {
				480	case 5: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				481	case 4: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				482	case 3: ch += *source++; ch <<= 6;
				483	case 2: ch += *source++; ch <<= 6;
				484	case 1: ch += *source++; ch <<= 6;
				485	case 0: ch += *source++;
				486	}
				487	ch -= offsetsFromUTF8[extraBytesToRead];
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	488
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	489	if (target >= targetEnd) {
				490	source -= (extraBytesToRead+1); /* Back up source pointer! */
				491	result = targetExhausted; break;
				492	}
				493	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				494	/* UTF-16 surrogate values are illegal in UTF-32 */
				495	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				496	if (flags == strictConversion) {
				497	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				498	result = sourceIllegal;
				499	break;
				500	} else {
				501	*target++ = UNI_REPLACEMENT_CHAR;
				502	}
				503	} else {
				504	target++ = (UTF16)ch; / normal case */
				505	}
				506	} else if (ch > UNI_MAX_UTF16) {
				507	if (flags == strictConversion) {
				508	result = sourceIllegal;
				509	source -= (extraBytesToRead+1); /* return to the start */
				510	break; /* Bail out; shouldn't continue */
				511	} else {
				512	*target++ = UNI_REPLACEMENT_CHAR;
				513	}
				514	} else {
				515	/* target is a character in range 0xFFFF - 0x10FFFF. */
				516	if (target + 1 >= targetEnd) {
				517	source -= (extraBytesToRead+1); /* Back up source pointer! */
				518	result = targetExhausted; break;
				519	}
				520	ch -= halfBase;
				521	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				522	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				523	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	524	}
				525	*sourceStart = source;
				526	*targetStart = target;
				527	return result;
				528	}
				529
				530	/* ---------------------------------------------------------------------
				531
				532	Note A.
				533	The fall-through switches in UTF-8 reading code save a
				534	temp variable, some decrements & conditionals. The switches
				535	are equivalent to the following loop:
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	536	{
				537	int tmpBytesToRead = extraBytesToRead+1;
				538	do {
				539	ch += *source++;
				540	--tmpBytesToRead;
				541	if (tmpBytesToRead) ch <<= 6;
				542	} while (tmpBytesToRead > 0);
				543	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	544	In UTF-8 writing code, the switches on "bytesToWrite" are
				545	similarly unrolled loops.
				546
				547	--------------------------------------------------------------------- */