Blame - lib/Basic/ConvertUTF.c - fp2-dev/platform/external/clang

blob: d16965ddd872d47845a3cf73bdd75090fc42fd94 [file] [log] [blame]

Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	1	/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
				2	*
				3	* The LLVM Compiler Infrastructure
				4	*
				5	* This file is distributed under the University of Illinois Open Source
				6	* License. See LICENSE.TXT for details.
				7	*
				8	*===------------------------------------------------------------------------=*/
				9	/*
				10	* Copyright 2001-2004 Unicode, Inc.
				11	*
				12	* Disclaimer
				13	*
				14	* This source code is provided as is by Unicode, Inc. No claims are
				15	* made as to fitness for any particular purpose. No warranties of any
				16	* kind are expressed or implied. The recipient agrees to determine
				17	* applicability of information provided. If this file has been
				18	* purchased on magnetic or optical media from Unicode, Inc., the
				19	* sole remedy for any claim will be exchange of defective media
				20	* within 90 days of receipt.
				21	*
				22	* Limitations on Rights to Redistribute This Code
				23	*
				24	* Unicode, Inc. hereby grants the right to freely use the information
				25	* supplied in this file in the creation of products supporting the
				26	* Unicode Standard, and to make copies of this file in any form
				27	* for internal or external distribution as long as this notice
				28	* remains attached.
				29	*/
				30
				31	/* ---------------------------------------------------------------------
				32
				33	Conversions between UTF32, UTF-16, and UTF-8. Source code file.
				34	Author: Mark E. Davis, 1994.
				35	Rev History: Rick McGowan, fixes & updates May 2001.
				36	Sept 2001: fixed const & error conditions per
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	37	mods suggested by S. Parent & A. Lillich.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	38	June 2002: Tim Dodd added detection and handling of incomplete
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	39	source sequences, enhanced error detection, added casts
				40	to eliminate compiler warnings.
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	41	July 2003: slight mods to back out aggressive FFFE detection.
				42	Jan 2004: updated switches in from-UTF8 conversions.
				43	Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
				44
				45	See the header file "ConvertUTF.h" for complete documentation.
				46
				47	------------------------------------------------------------------------ */
				48
				49
				50	#include "clang/Basic/ConvertUTF.h"
				51	#ifdef CVTUTF_DEBUG
				52	#include <stdio.h>
				53	#endif
				54
				55	static const int halfShift = 10; /* used for shifting by 10 bits */
				56
				57	static const UTF32 halfBase = 0x0010000UL;
				58	static const UTF32 halfMask = 0x3FFUL;
				59
				60	#define UNI_SUR_HIGH_START (UTF32)0xD800
				61	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				62	#define UNI_SUR_LOW_START (UTF32)0xDC00
				63	#define UNI_SUR_LOW_END (UTF32)0xDFFF
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	64	#define false 0
				65	#define true 1
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	66
				67	/* --------------------------------------------------------------------- */
				68
				69	/*
				70	* Index into the table below with the first byte of a UTF-8 sequence to
				71	* get the number of trailing bytes that are supposed to follow it.
				72	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
				73	* left as-is for anyone who may want to do such conversion, which was
				74	* allowed in earlier algorithms.
				75	*/
				76	static const char trailingBytesForUTF8[256] = {
				77	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				78	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				79	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				80	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				81	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				82	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				83	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
				84	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
				85	};
				86
				87	/*
				88	* Magic values subtracted from a buffer value during UTF8 conversion.
				89	* This table contains as many values as there might be trailing bytes
				90	* in a UTF-8 sequence.
				91	*/
				92	static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	93	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	94
				95	/*
				96	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				97	* into the first byte, depending on how many bytes follow. There are
				98	* as many entries in this table as there are UTF-8 sequence types.
				99	* (I.e., one byte sequence, two byte... etc.). Remember that sequences
				100	* for legal UTF-8 will be 4 or fewer bytes total.
				101	*/
				102	static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				103
				104	/* --------------------------------------------------------------------- */
				105
				106	/* The interface converts a whole buffer to avoid function-call overhead.
				107	* Constants have been gathered. Loops & conditionals have been removed as
				108	* much as possible for efficiency, in favor of drop-through switches.
				109	* (See "Note A" at the bottom of the file for equivalent code.)
				110	* If your compiler supports it, the "isLegalUTF8" call can be turned
				111	* into an inline function.
				112	*/
				113
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	114
				115	/* --------------------------------------------------------------------- */
				116
				117	ConversionResult ConvertUTF32toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	118	const UTF32** sourceStart, const UTF32* sourceEnd,
				119	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	120	ConversionResult result = conversionOK;
				121	const UTF32* source = *sourceStart;
				122	UTF16* target = *targetStart;
				123	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	124	UTF32 ch;
				125	if (target >= targetEnd) {
				126	result = targetExhausted; break;
				127	}
				128	ch = *source++;
				129	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				130	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				131	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				132	if (flags == strictConversion) {
				133	--source; /* return to the illegal value itself */
				134	result = sourceIllegal;
				135	break;
				136	} else {
				137	*target++ = UNI_REPLACEMENT_CHAR;
				138	}
				139	} else {
				140	target++ = (UTF16)ch; / normal case */
				141	}
				142	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				143	if (flags == strictConversion) {
				144	result = sourceIllegal;
				145	} else {
				146	*target++ = UNI_REPLACEMENT_CHAR;
				147	}
				148	} else {
				149	/* target is a character in range 0xFFFF - 0x10FFFF. */
				150	if (target + 1 >= targetEnd) {
				151	--source; /* Back up source pointer! */
				152	result = targetExhausted; break;
				153	}
				154	ch -= halfBase;
				155	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				156	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				157	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	158	}
				159	*sourceStart = source;
				160	*targetStart = target;
				161	return result;
				162	}
				163
				164	/* --------------------------------------------------------------------- */
				165
				166	ConversionResult ConvertUTF16toUTF32 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	167	const UTF16** sourceStart, const UTF16* sourceEnd,
				168	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	169	ConversionResult result = conversionOK;
				170	const UTF16* source = *sourceStart;
				171	UTF32* target = *targetStart;
				172	UTF32 ch, ch2;
				173	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	174	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				175	ch = *source++;
				176	/* If we have a surrogate pair, convert to UTF32 first. */
				177	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				178	/* If the 16 bits following the high surrogate are in the source buffer... */
				179	if (source < sourceEnd) {
				180	ch2 = *source;
				181	/* If it's a low surrogate, convert to UTF32. */
				182	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				183	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				184	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				185	++source;
				186	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				187	--source; /* return to the illegal value itself */
				188	result = sourceIllegal;
				189	break;
				190	}
				191	} else { /* We don't have the 16 bits following the high surrogate. */
				192	--source; /* return to the high surrogate */
				193	result = sourceExhausted;
				194	break;
				195	}
				196	} else if (flags == strictConversion) {
				197	/* UTF-16 surrogate values are illegal in UTF-32 */
				198	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				199	--source; /* return to the illegal value itself */
				200	result = sourceIllegal;
				201	break;
				202	}
				203	}
				204	if (target >= targetEnd) {
				205	source = oldSource; /* Back up source pointer! */
				206	result = targetExhausted; break;
				207	}
				208	*target++ = ch;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	209	}
				210	*sourceStart = source;
				211	*targetStart = target;
				212	#ifdef CVTUTF_DEBUG
				213	if (result == sourceIllegal) {
				214	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
				215	fflush(stderr);
				216	}
				217	#endif
				218	return result;
				219	}
				220	ConversionResult ConvertUTF16toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	221	const UTF16** sourceStart, const UTF16* sourceEnd,
				222	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	223	ConversionResult result = conversionOK;
				224	const UTF16* source = *sourceStart;
				225	UTF8* target = *targetStart;
				226	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	227	UTF32 ch;
				228	unsigned short bytesToWrite = 0;
				229	const UTF32 byteMask = 0xBF;
				230	const UTF32 byteMark = 0x80;
				231	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				232	ch = *source++;
				233	/* If we have a surrogate pair, convert to UTF32 first. */
				234	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				235	/* If the 16 bits following the high surrogate are in the source buffer... */
				236	if (source < sourceEnd) {
				237	UTF32 ch2 = *source;
				238	/* If it's a low surrogate, convert to UTF32. */
				239	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				240	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				241	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				242	++source;
				243	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				244	--source; /* return to the illegal value itself */
				245	result = sourceIllegal;
				246	break;
				247	}
				248	} else { /* We don't have the 16 bits following the high surrogate. */
				249	--source; /* return to the high surrogate */
				250	result = sourceExhausted;
				251	break;
				252	}
				253	} else if (flags == strictConversion) {
				254	/* UTF-16 surrogate values are illegal in UTF-32 */
				255	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				256	--source; /* return to the illegal value itself */
				257	result = sourceIllegal;
				258	break;
				259	}
				260	}
				261	/* Figure out how many bytes the result will require */
				262	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				263	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				264	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				265	} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
				266	} else { bytesToWrite = 3;
				267	ch = UNI_REPLACEMENT_CHAR;
				268	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	269
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	270	target += bytesToWrite;
				271	if (target > targetEnd) {
				272	source = oldSource; /* Back up source pointer! */
				273	target -= bytesToWrite; result = targetExhausted; break;
				274	}
				275	switch (bytesToWrite) { /* note: everything falls through. */
				276	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				277	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				278	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				279	case 1: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
				280	}
				281	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	282	}
				283	*sourceStart = source;
				284	*targetStart = target;
				285	return result;
				286	}
				287
				288	/* --------------------------------------------------------------------- */
				289
				290	ConversionResult ConvertUTF32toUTF8 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	291	const UTF32** sourceStart, const UTF32* sourceEnd,
				292	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	293	ConversionResult result = conversionOK;
				294	const UTF32* source = *sourceStart;
				295	UTF8* target = *targetStart;
				296	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	297	UTF32 ch;
				298	unsigned short bytesToWrite = 0;
				299	const UTF32 byteMask = 0xBF;
				300	const UTF32 byteMark = 0x80;
				301	ch = *source++;
				302	if (flags == strictConversion ) {
				303	/* UTF-16 surrogate values are illegal in UTF-32 */
				304	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				305	--source; /* return to the illegal value itself */
				306	result = sourceIllegal;
				307	break;
				308	}
				309	}
				310	/*
				311	* Figure out how many bytes the result will require. Turn any
				312	* illegally large UTF32 things (> Plane 17) into replacement chars.
				313	*/
				314	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				315	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				316	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				317	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
				318	} else { bytesToWrite = 3;
				319	ch = UNI_REPLACEMENT_CHAR;
				320	result = sourceIllegal;
				321	}
				322
				323	target += bytesToWrite;
				324	if (target > targetEnd) {
				325	--source; /* Back up source pointer! */
				326	target -= bytesToWrite; result = targetExhausted; break;
				327	}
				328	switch (bytesToWrite) { /* note: everything falls through. */
				329	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				330	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				331	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				332	case 1: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				333	}
				334	target += bytesToWrite;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	335	}
				336	*sourceStart = source;
				337	*targetStart = target;
				338	return result;
				339	}
				340
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	341	/* --------------------------------------------------------------------- */
				342
				343	/*
				344	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				345	* This must be called with the length pre-determined by the first byte.
				346	* If not calling this from ConvertUTF8to*, then the length can be set by:
				347	* length = trailingBytesForUTF8[*source]+1;
				348	* and the sequence is illegal right away if there aren't that many bytes
				349	* available.
				350	* If presented with a length > 4, this returns false. The Unicode
				351	* definition of UTF-8 goes up to 4-byte sequences.
				352	*/
				353
				354	static Boolean isLegalUTF8(const UTF8 *source, int length) {
				355	UTF8 a;
				356	const UTF8 *srcptr = source+length;
				357	switch (length) {
				358	default: return false;
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	359	/* Everything else falls through when "true"... */
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	360	case 4: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				361	case 3: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
Seth Cantrell	31ba273	2012-10-30 23:50:26 +0000	[diff] [blame]	362	case 2: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	363
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	364	switch (*source) {
				365	/* no fall-through in this inner switch */
				366	case 0xE0: if (a < 0xA0) return false; break;
				367	case 0xED: if (a > 0x9F) return false; break;
				368	case 0xF0: if (a < 0x90) return false; break;
				369	case 0xF4: if (a > 0x8F) return false; break;
				370	default: if (a < 0x80) return false;
				371	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	372
				373	case 1: if (source >= 0x80 && source < 0xC2) return false;
				374	}
				375	if (*source > 0xF4) return false;
				376	return true;
				377	}
				378
				379	/* --------------------------------------------------------------------- */
				380
				381	/*
				382	* Exported function to return whether a UTF-8 sequence is legal or not.
				383	* This is not used here; it's just exported.
				384	*/
				385	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				386	int length = trailingBytesForUTF8[*source]+1;
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	387	if (length > sourceEnd - source) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	388	return false;
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	389	}
				390	return isLegalUTF8(source, length);
				391	}
				392
				393	/* --------------------------------------------------------------------- */
				394
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	395	/*
Richard Smith	e5f0588	2012-09-08 07:16:20 +0000	[diff] [blame]	396	* Exported function to return the total number of bytes in a codepoint
				397	* represented in UTF-8, given the value of the first byte.
				398	*/
				399	unsigned getNumBytesForUTF8(UTF8 first) {
				400	return trailingBytesForUTF8[first] + 1;
				401	}
				402
				403	/* --------------------------------------------------------------------- */
				404
				405	/*
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	406	* Exported function to return whether a UTF-8 string is legal or not.
				407	* This is not used here; it's just exported.
				408	*/
Richard Smith	e5f0588	2012-09-08 07:16:20 +0000	[diff] [blame]	409	Boolean isLegalUTF8String(const UTF8 *source, const UTF8 sourceEnd) {
				410	while (*source != sourceEnd) {
				411	int length = trailingBytesForUTF8[**source] + 1;
				412	if (length > sourceEnd - source \|\| !isLegalUTF8(source, length))
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	413	return false;
Richard Smith	e5f0588	2012-09-08 07:16:20 +0000	[diff] [blame]	414	*source += length;
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	415	}
				416	return true;
				417	}
				418
				419	/* --------------------------------------------------------------------- */
				420
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	421	ConversionResult ConvertUTF8toUTF16 (
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	422	const UTF8** sourceStart, const UTF8* sourceEnd,
				423	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	424	ConversionResult result = conversionOK;
				425	const UTF8* source = *sourceStart;
				426	UTF16* target = *targetStart;
				427	while (source < sourceEnd) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	428	UTF32 ch = 0;
				429	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	430	if (extraBytesToRead >= sourceEnd - source) {
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	431	result = sourceExhausted; break;
				432	}
				433	/* Do this check whether lenient or strict */
				434	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				435	result = sourceIllegal;
				436	break;
				437	}
				438	/*
				439	* The cases all fall through. See "Note A" below.
				440	*/
				441	switch (extraBytesToRead) {
				442	case 5: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				443	case 4: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				444	case 3: ch += *source++; ch <<= 6;
				445	case 2: ch += *source++; ch <<= 6;
				446	case 1: ch += *source++; ch <<= 6;
				447	case 0: ch += *source++;
				448	}
				449	ch -= offsetsFromUTF8[extraBytesToRead];
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	450
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	451	if (target >= targetEnd) {
				452	source -= (extraBytesToRead+1); /* Back up source pointer! */
				453	result = targetExhausted; break;
				454	}
				455	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				456	/* UTF-16 surrogate values are illegal in UTF-32 */
				457	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				458	if (flags == strictConversion) {
				459	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				460	result = sourceIllegal;
				461	break;
				462	} else {
				463	*target++ = UNI_REPLACEMENT_CHAR;
				464	}
				465	} else {
				466	target++ = (UTF16)ch; / normal case */
				467	}
				468	} else if (ch > UNI_MAX_UTF16) {
				469	if (flags == strictConversion) {
				470	result = sourceIllegal;
				471	source -= (extraBytesToRead+1); /* return to the start */
				472	break; /* Bail out; shouldn't continue */
				473	} else {
				474	*target++ = UNI_REPLACEMENT_CHAR;
				475	}
				476	} else {
				477	/* target is a character in range 0xFFFF - 0x10FFFF. */
				478	if (target + 1 >= targetEnd) {
				479	source -= (extraBytesToRead+1); /* Back up source pointer! */
				480	result = targetExhausted; break;
				481	}
				482	ch -= halfBase;
				483	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				484	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				485	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	486	}
				487	*sourceStart = source;
				488	*targetStart = target;
				489	return result;
				490	}
				491
Eli Friedman	436ecd9	2011-11-01 02:10:54 +0000	[diff] [blame]	492	/* --------------------------------------------------------------------- */
				493
				494	ConversionResult ConvertUTF8toUTF32 (
				495	const UTF8** sourceStart, const UTF8* sourceEnd,
				496	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
				497	ConversionResult result = conversionOK;
				498	const UTF8* source = *sourceStart;
				499	UTF32* target = *targetStart;
				500	while (source < sourceEnd) {
				501	UTF32 ch = 0;
				502	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
Richard Smith	49d5174	2012-03-08 21:59:28 +0000	[diff] [blame]	503	if (extraBytesToRead >= sourceEnd - source) {
Eli Friedman	436ecd9	2011-11-01 02:10:54 +0000	[diff] [blame]	504	result = sourceExhausted; break;
				505	}
				506	/* Do this check whether lenient or strict */
				507	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				508	result = sourceIllegal;
				509	break;
				510	}
				511	/*
				512	* The cases all fall through. See "Note A" below.
				513	*/
				514	switch (extraBytesToRead) {
				515	case 5: ch += *source++; ch <<= 6;
				516	case 4: ch += *source++; ch <<= 6;
				517	case 3: ch += *source++; ch <<= 6;
				518	case 2: ch += *source++; ch <<= 6;
				519	case 1: ch += *source++; ch <<= 6;
				520	case 0: ch += *source++;
				521	}
				522	ch -= offsetsFromUTF8[extraBytesToRead];
				523
				524	if (target >= targetEnd) {
				525	source -= (extraBytesToRead+1); /* Back up the source pointer! */
				526	result = targetExhausted; break;
				527	}
				528	if (ch <= UNI_MAX_LEGAL_UTF32) {
				529	/*
				530	* UTF-16 surrogate values are illegal in UTF-32, and anything
				531	* over Plane 17 (> 0x10FFFF) is illegal.
				532	*/
				533	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				534	if (flags == strictConversion) {
				535	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				536	result = sourceIllegal;
				537	break;
				538	} else {
				539	*target++ = UNI_REPLACEMENT_CHAR;
				540	}
				541	} else {
				542	*target++ = ch;
				543	}
				544	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				545	result = sourceIllegal;
				546	*target++ = UNI_REPLACEMENT_CHAR;
				547	}
				548	}
				549	*sourceStart = source;
				550	*targetStart = target;
				551	return result;
				552	}
				553
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	554	/* ---------------------------------------------------------------------
				555
				556	Note A.
				557	The fall-through switches in UTF-8 reading code save a
				558	temp variable, some decrements & conditionals. The switches
				559	are equivalent to the following loop:
Mike Stump	1eb4433	2009-09-09 15:08:12 +0000	[diff] [blame]	560	{
				561	int tmpBytesToRead = extraBytesToRead+1;
				562	do {
				563	ch += *source++;
				564	--tmpBytesToRead;
				565	if (tmpBytesToRead) ch <<= 6;
				566	} while (tmpBytesToRead > 0);
				567	}
Steve Naroff	e9b7d8a	2009-04-01 15:50:34 +0000	[diff] [blame]	568	In UTF-8 writing code, the switches on "bytesToWrite" are
				569	similarly unrolled loops.
				570
				571	--------------------------------------------------------------------- */