Blame - lib/Basic/ConvertUTF.c - fp2-dev/platform/external/clang

blob: 2e25e79c4c6e14dfef1fb75344bbe0c33b7d1f66 [file] [log] [blame]

Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	1	/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
				2	*
				3	* The LLVM Compiler Infrastructure
				4	*
				5	* This file is distributed under the University of Illinois Open Source
				6	* License. See LICENSE.TXT for details.
				7	*
				8	===------------------------------------------------------------------------=/
				9	/*
				10	* Copyright 2001-2004 Unicode, Inc.
				11	*
				12	* Disclaimer
				13	*
				14	* This source code is provided as is by Unicode, Inc. No claims are
				15	* made as to fitness for any particular purpose. No warranties of any
				16	* kind are expressed or implied. The recipient agrees to determine
				17	* applicability of information provided. If this file has been
				18	* purchased on magnetic or optical media from Unicode, Inc., the
				19	* sole remedy for any claim will be exchange of defective media
				20	* within 90 days of receipt.
				21	*
				22	* Limitations on Rights to Redistribute This Code
				23	*
				24	* Unicode, Inc. hereby grants the right to freely use the information
				25	* supplied in this file in the creation of products supporting the
				26	* Unicode Standard, and to make copies of this file in any form
				27	* for internal or external distribution as long as this notice
				28	* remains attached.
				29	*/
				30
				31	/* ---------------------------------------------------------------------
				32
				33	Conversions between UTF32, UTF-16, and UTF-8. Source code file.
				34	Author: Mark E. Davis, 1994.
				35	Rev History: Rick McGowan, fixes & updates May 2001.
				36	Sept 2001: fixed const & error conditions per
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	37	mods suggested by S. Parent & A. Lillich.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	38	June 2002: Tim Dodd added detection and handling of incomplete
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	39	source sequences, enhanced error detection, added casts
				40	to eliminate compiler warnings.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	41	July 2003: slight mods to back out aggressive FFFE detection.
				42	Jan 2004: updated switches in from-UTF8 conversions.
				43	Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
				44
				45	See the header file "ConvertUTF.h" for complete documentation.
				46
				47	------------------------------------------------------------------------ */
				48
				49
				50	#include "clang/Basic/ConvertUTF.h"
				51	#ifdef CVTUTF_DEBUG
				52	#include <stdio.h>
				53	#endif
				54
				55	static const int halfShift = 10; /* used for shifting by 10 bits */
				56
				57	static const UTF32 halfBase = 0x0010000UL;
				58	static const UTF32 halfMask = 0x3FFUL;
				59
				60	#define UNI_SUR_HIGH_START (UTF32)0xD800
				61	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				62	#define UNI_SUR_LOW_START (UTF32)0xDC00
				63	#define UNI_SUR_LOW_END (UTF32)0xDFFF
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	64	#define false 0
				65	#define true 1
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	66
				67	/* --------------------------------------------------------------------- */
				68
				69	/*
				70	* Index into the table below with the first byte of a UTF-8 sequence to
				71	* get the number of trailing bytes that are supposed to follow it.
				72	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
				73	* left as-is for anyone who may want to do such conversion, which was
				74	* allowed in earlier algorithms.
				75	*/
				76	static const char trailingBytesForUTF8[256] = {
				77	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				78	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				79	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				80	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				81	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				82	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				83	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
				84	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
				85	};
				86
				87	/*
				88	* Magic values subtracted from a buffer value during UTF8 conversion.
				89	* This table contains as many values as there might be trailing bytes
				90	* in a UTF-8 sequence.
				91	*/
				92	static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	93	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	94
				95	/*
				96	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				97	* into the first byte, depending on how many bytes follow. There are
				98	* as many entries in this table as there are UTF-8 sequence types.
				99	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
				100	* for legal UTF-8 will be 4 or fewer bytes total.
				101	*/
				102	static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				103
				104	/* --------------------------------------------------------------------- */
				105
				106	/* The interface converts a whole buffer to avoid function-call overhead.
				107	* Constants have been gathered. Loops & conditionals have been removed as
				108	* much as possible for efficiency, in favor of drop-through switches.
				109	* (See "Note A" at the bottom of the file for equivalent code.)
				110	* If your compiler supports it, the "isLegalUTF8" call can be turned
				111	* into an inline function.
				112	*/
				113
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	114
				115	/* --------------------------------------------------------------------- */
				116
				117	ConversionResult ConvertUTF32toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	118	const UTF32** sourceStart, const UTF32* sourceEnd,
				119	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	120	ConversionResult result = conversionOK;
				121	const UTF32* source = *sourceStart;
				122	UTF16* target = *targetStart;
				123	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	124	UTF32 ch;
				125	if (target >= targetEnd) {
				126	result = targetExhausted; break;
				127	}
				128	ch = *source++;
				129	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				130	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				131	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				132	if (flags == strictConversion) {
				133	--source; /* return to the illegal value itself */
				134	result = sourceIllegal;
				135	break;
				136	} else {
				137	*target++ = UNI_REPLACEMENT_CHAR;
				138	}
				139	} else {
				140	target++ = (UTF16)ch; / normal case */
				141	}
				142	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				143	if (flags == strictConversion) {
				144	result = sourceIllegal;
				145	} else {
				146	*target++ = UNI_REPLACEMENT_CHAR;
				147	}
				148	} else {
				149	/* target is a character in range 0xFFFF - 0x10FFFF. */
				150	if (target + 1 >= targetEnd) {
				151	--source; /* Back up source pointer! */
				152	result = targetExhausted; break;
				153	}
				154	ch -= halfBase;
				155	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				156	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				157	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	158	}
				159	*sourceStart = source;
				160	*targetStart = target;
				161	return result;
				162	}
				163
				164	/* --------------------------------------------------------------------- */
				165
				166	ConversionResult ConvertUTF16toUTF32 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	167	const UTF16** sourceStart, const UTF16* sourceEnd,
				168	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	169	ConversionResult result = conversionOK;
				170	const UTF16* source = *sourceStart;
				171	UTF32* target = *targetStart;
				172	UTF32 ch, ch2;
				173	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	174	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				175	ch = *source++;
				176	/* If we have a surrogate pair, convert to UTF32 first. */
				177	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				178	/* If the 16 bits following the high surrogate are in the source buffer... */
				179	if (source < sourceEnd) {
				180	ch2 = *source;
				181	/* If it's a low surrogate, convert to UTF32. */
				182	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				183	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				184	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				185	++source;
				186	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				187	--source; /* return to the illegal value itself */
				188	result = sourceIllegal;
				189	break;
				190	}
				191	} else { /* We don't have the 16 bits following the high surrogate. */
				192	--source; /* return to the high surrogate */
				193	result = sourceExhausted;
				194	break;
				195	}
				196	} else if (flags == strictConversion) {
				197	/* UTF-16 surrogate values are illegal in UTF-32 */
				198	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				199	--source; /* return to the illegal value itself */
				200	result = sourceIllegal;
				201	break;
				202	}
				203	}
				204	if (target >= targetEnd) {
				205	source = oldSource; /* Back up source pointer! */
				206	result = targetExhausted; break;
				207	}
				208	*target++ = ch;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	209	}
				210	*sourceStart = source;
				211	*targetStart = target;
				212	#ifdef CVTUTF_DEBUG
				213	if (result == sourceIllegal) {
				214	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
				215	fflush(stderr);
				216	}
				217	#endif
				218	return result;
				219	}
				220	ConversionResult ConvertUTF16toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	221	const UTF16** sourceStart, const UTF16* sourceEnd,
				222	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	223	ConversionResult result = conversionOK;
				224	const UTF16* source = *sourceStart;
				225	UTF8* target = *targetStart;
				226	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	227	UTF32 ch;
				228	unsigned short bytesToWrite = 0;
				229	const UTF32 byteMask = 0xBF;
				230	const UTF32 byteMark = 0x80;
				231	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				232	ch = *source++;
				233	/* If we have a surrogate pair, convert to UTF32 first. */
				234	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				235	/* If the 16 bits following the high surrogate are in the source buffer... */
				236	if (source < sourceEnd) {
				237	UTF32 ch2 = *source;
				238	/* If it's a low surrogate, convert to UTF32. */
				239	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				240	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				241	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				242	++source;
				243	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				244	--source; /* return to the illegal value itself */
				245	result = sourceIllegal;
				246	break;
				247	}
				248	} else { /* We don't have the 16 bits following the high surrogate. */
				249	--source; /* return to the high surrogate */
				250	result = sourceExhausted;
				251	break;
				252	}
				253	} else if (flags == strictConversion) {
				254	/* UTF-16 surrogate values are illegal in UTF-32 */
				255	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				256	--source; /* return to the illegal value itself */
				257	result = sourceIllegal;
				258	break;
				259	}
				260	}
				261	/* Figure out how many bytes the result will require */
				262	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				263	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				264	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				265	} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
				266	} else { bytesToWrite = 3;
				267	ch = UNI_REPLACEMENT_CHAR;
				268	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	269
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	270	target += bytesToWrite;
				271	if (target > targetEnd) {
				272	source = oldSource; /* Back up source pointer! */
				273	target -= bytesToWrite; result = targetExhausted; break;
				274	}
				275	switch (bytesToWrite) { /* note: everything falls through. */
				276	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				277	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				278	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				279	case 1: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
				280	}
				281	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	282	}
				283	*sourceStart = source;
				284	*targetStart = target;
				285	return result;
				286	}
				287
				288	/* --------------------------------------------------------------------- */
				289
				290	ConversionResult ConvertUTF32toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	291	const UTF32** sourceStart, const UTF32* sourceEnd,
				292	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	293	ConversionResult result = conversionOK;
				294	const UTF32* source = *sourceStart;
				295	UTF8* target = *targetStart;
				296	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	297	UTF32 ch;
				298	unsigned short bytesToWrite = 0;
				299	const UTF32 byteMask = 0xBF;
				300	const UTF32 byteMark = 0x80;
				301	ch = *source++;
				302	if (flags == strictConversion ) {
				303	/* UTF-16 surrogate values are illegal in UTF-32 */
				304	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				305	--source; /* return to the illegal value itself */
				306	result = sourceIllegal;
				307	break;
				308	}
				309	}
				310	/*
				311	* Figure out how many bytes the result will require. Turn any
				312	* illegally large UTF32 things (> Plane 17) into replacement chars.
				313	*/
				314	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				315	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				316	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				317	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
				318	} else { bytesToWrite = 3;
				319	ch = UNI_REPLACEMENT_CHAR;
				320	result = sourceIllegal;
				321	}
				322
				323	target += bytesToWrite;
				324	if (target > targetEnd) {
				325	--source; /* Back up source pointer! */
				326	target -= bytesToWrite; result = targetExhausted; break;
				327	}
				328	switch (bytesToWrite) { /* note: everything falls through. */
				329	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				330	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				331	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				332	case 1: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				333	}
				334	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	335	}
				336	*sourceStart = source;
				337	*targetStart = target;
				338	return result;
				339	}
				340
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	341	/* --------------------------------------------------------------------- */
				342
				343	/*
				344	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				345	* This must be called with the length pre-determined by the first byte.
				346	* If not calling this from ConvertUTF8to*, then the length can be set by:
				347	* length = trailingBytesForUTF8[*source]+1;
				348	* and the sequence is illegal right away if there aren't that many bytes
				349	* available.
				350	* If presented with a length > 4, this returns false. The Unicode
				351	* definition of UTF-8 goes up to 4-byte sequences.
				352	*/
				353
				354	static Boolean isLegalUTF8(const UTF8 *source, int length) {
				355	UTF8 a;
				356	const UTF8 *srcptr = source+length;
				357	switch (length) {
				358	default: return false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	359	/* Everything else falls through when "true"... */
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	360	case 4: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				361	case 3: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				362	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
				363
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	364	switch (*source) {
				365	/* no fall-through in this inner switch */
				366	case 0xE0: if (a < 0xA0) return false; break;
				367	case 0xED: if (a > 0x9F) return false; break;
				368	case 0xF0: if (a < 0x90) return false; break;
				369	case 0xF4: if (a > 0x8F) return false; break;
				370	default: if (a < 0x80) return false;
				371	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	372
				373	case 1: if (source >= 0x80 && source < 0xC2) return false;
				374	}
				375	if (*source > 0xF4) return false;
				376	return true;
				377	}
				378
				379	/* --------------------------------------------------------------------- */
				380
				381	/*
				382	* Exported function to return whether a UTF-8 sequence is legal or not.
				383	* This is not used here; it's just exported.
				384	*/
				385	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				386	int length = trailingBytesForUTF8[*source]+1;
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	387	if (length > sourceEnd - source) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	388	return false;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	389	}
				390	return isLegalUTF8(source, length);
				391	}
				392
				393	/* --------------------------------------------------------------------- */
				394
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	395	/*
				396	* Exported function to return whether a UTF-8 string is legal or not.
				397	* This is not used here; it's just exported.
				398	*/
				399	Boolean isLegalUTF8String(const UTF8 source, const UTF8 sourceEnd) {
				400	while (source != sourceEnd) {
				401	int length = trailingBytesForUTF8[*source] + 1;
				402	if (length > sourceEnd - source \|\| !isLegalUTF8(source, length))
				403	return false;
				404	source += length;
				405	}
				406	return true;
				407	}
				408
				409	/* --------------------------------------------------------------------- */
				410
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	411	ConversionResult ConvertUTF8toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	412	const UTF8** sourceStart, const UTF8* sourceEnd,
				413	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	414	ConversionResult result = conversionOK;
				415	const UTF8* source = *sourceStart;
				416	UTF16* target = *targetStart;
				417	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	418	UTF32 ch = 0;
				419	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	420	if (extraBytesToRead >= sourceEnd - source) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	421	result = sourceExhausted; break;
				422	}
				423	/* Do this check whether lenient or strict */
				424	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				425	result = sourceIllegal;
				426	break;
				427	}
				428	/*
				429	* The cases all fall through. See "Note A" below.
				430	*/
				431	switch (extraBytesToRead) {
				432	case 5: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				433	case 4: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				434	case 3: ch += *source++; ch <<= 6;
				435	case 2: ch += *source++; ch <<= 6;
				436	case 1: ch += *source++; ch <<= 6;
				437	case 0: ch += *source++;
				438	}
				439	ch -= offsetsFromUTF8[extraBytesToRead];
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	440
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	441	if (target >= targetEnd) {
				442	source -= (extraBytesToRead+1); /* Back up source pointer! */
				443	result = targetExhausted; break;
				444	}
				445	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				446	/* UTF-16 surrogate values are illegal in UTF-32 */
				447	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				448	if (flags == strictConversion) {
				449	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				450	result = sourceIllegal;
				451	break;
				452	} else {
				453	*target++ = UNI_REPLACEMENT_CHAR;
				454	}
				455	} else {
				456	target++ = (UTF16)ch; / normal case */
				457	}
				458	} else if (ch > UNI_MAX_UTF16) {
				459	if (flags == strictConversion) {
				460	result = sourceIllegal;
				461	source -= (extraBytesToRead+1); /* return to the start */
				462	break; /* Bail out; shouldn't continue */
				463	} else {
				464	*target++ = UNI_REPLACEMENT_CHAR;
				465	}
				466	} else {
				467	/* target is a character in range 0xFFFF - 0x10FFFF. */
				468	if (target + 1 >= targetEnd) {
				469	source -= (extraBytesToRead+1); /* Back up source pointer! */
				470	result = targetExhausted; break;
				471	}
				472	ch -= halfBase;
				473	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				474	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				475	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	476	}
				477	*sourceStart = source;
				478	*targetStart = target;
				479	return result;
				480	}
				481
Eli Friedman	436ecd9	2011-11-01 02:10:54 +0000	[diff] [blame]	482	/* --------------------------------------------------------------------- */
				483
				484	ConversionResult ConvertUTF8toUTF32 (
				485	const UTF8** sourceStart, const UTF8* sourceEnd,
				486	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
				487	ConversionResult result = conversionOK;
				488	const UTF8* source = *sourceStart;
				489	UTF32* target = *targetStart;
				490	while (source < sourceEnd) {
				491	UTF32 ch = 0;
				492	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	493	if (extraBytesToRead >= sourceEnd - source) {
Eli Friedman	436ecd9	2011-11-01 02:10:54 +0000	[diff] [blame]	494	result = sourceExhausted; break;
				495	}
				496	/* Do this check whether lenient or strict */
				497	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				498	result = sourceIllegal;
				499	break;
				500	}
				501	/*
				502	* The cases all fall through. See "Note A" below.
				503	*/
				504	switch (extraBytesToRead) {
				505	case 5: ch += *source++; ch <<= 6;
				506	case 4: ch += *source++; ch <<= 6;
				507	case 3: ch += *source++; ch <<= 6;
				508	case 2: ch += *source++; ch <<= 6;
				509	case 1: ch += *source++; ch <<= 6;
				510	case 0: ch += *source++;
				511	}
				512	ch -= offsetsFromUTF8[extraBytesToRead];
				513
				514	if (target >= targetEnd) {
				515	source -= (extraBytesToRead+1); /* Back up the source pointer! */
				516	result = targetExhausted; break;
				517	}
				518	if (ch <= UNI_MAX_LEGAL_UTF32) {
				519	/*
				520	* UTF-16 surrogate values are illegal in UTF-32, and anything
				521	* over Plane 17 (> 0x10FFFF) is illegal.
				522	*/
				523	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				524	if (flags == strictConversion) {
				525	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				526	result = sourceIllegal;
				527	break;
				528	} else {
				529	*target++ = UNI_REPLACEMENT_CHAR;
				530	}
				531	} else {
				532	*target++ = ch;
				533	}
				534	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				535	result = sourceIllegal;
				536	*target++ = UNI_REPLACEMENT_CHAR;
				537	}
				538	}
				539	*sourceStart = source;
				540	*targetStart = target;
				541	return result;
				542	}
				543
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	544	/* ---------------------------------------------------------------------
				545
				546	Note A.
				547	The fall-through switches in UTF-8 reading code save a
				548	temp variable, some decrements & conditionals. The switches
				549	are equivalent to the following loop:
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	550	{
				551	int tmpBytesToRead = extraBytesToRead+1;
				552	do {
				553	ch += *source++;
				554	--tmpBytesToRead;
				555	if (tmpBytesToRead) ch <<= 6;
				556	} while (tmpBytesToRead > 0);
				557	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	558	In UTF-8 writing code, the switches on "bytesToWrite" are
				559	similarly unrolled loops.
				560
				561	--------------------------------------------------------------------- */