Blame - llvm/lib/Support/ConvertUTF.cpp - toolchain/llvm-project

blob: aa9507c189eda0b7f2fb5c8b3b98156f4c351450 [file] [log] [blame]

Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	1	/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
				2	*
				3	* The LLVM Compiler Infrastructure
				4	*
				5	* This file is distributed under the University of Illinois Open Source
				6	* License. See LICENSE.TXT for details.
				7	*
				8	*===------------------------------------------------------------------------=*/
				9	/*
				10	* Copyright 2001-2004 Unicode, Inc.
				11	*
				12	* Disclaimer
				13	*
				14	* This source code is provided as is by Unicode, Inc. No claims are
				15	* made as to fitness for any particular purpose. No warranties of any
				16	* kind are expressed or implied. The recipient agrees to determine
				17	* applicability of information provided. If this file has been
				18	* purchased on magnetic or optical media from Unicode, Inc., the
				19	* sole remedy for any claim will be exchange of defective media
				20	* within 90 days of receipt.
				21	*
				22	* Limitations on Rights to Redistribute This Code
				23	*
				24	* Unicode, Inc. hereby grants the right to freely use the information
				25	* supplied in this file in the creation of products supporting the
				26	* Unicode Standard, and to make copies of this file in any form
				27	* for internal or external distribution as long as this notice
				28	* remains attached.
				29	*/
				30
				31	/* ---------------------------------------------------------------------
				32
				33	Conversions between UTF32, UTF-16, and UTF-8. Source code file.
				34	Author: Mark E. Davis, 1994.
				35	Rev History: Rick McGowan, fixes & updates May 2001.
				36	Sept 2001: fixed const & error conditions per
				37	mods suggested by S. Parent & A. Lillich.
				38	June 2002: Tim Dodd added detection and handling of incomplete
				39	source sequences, enhanced error detection, added casts
				40	to eliminate compiler warnings.
				41	July 2003: slight mods to back out aggressive FFFE detection.
				42	Jan 2004: updated switches in from-UTF8 conversions.
				43	Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
				44
				45	See the header file "ConvertUTF.h" for complete documentation.
				46
				47	------------------------------------------------------------------------ */
				48
				49
				50	#include "llvm/Support/ConvertUTF.h"
				51	#ifdef CVTUTF_DEBUG
				52	#include <stdio.h>
				53	#endif
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	54	#include <assert.h>
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	55
Galina Kistanova	229c9c1	2017-05-29 01:34:26 +0000	[diff] [blame]	56
				57	/*
				58	* This code extensively uses fall-through switches.
				59	* Keep the compiler from warning about that.
				60	*/
				61	#if defined(__clang__) && defined(__has_warning)
				62	# if __has_warning("-Wimplicit-fallthrough")
				63	# define ConvertUTF_DISABLE_WARNINGS \
				64	_Pragma("clang diagnostic push") \
				65	_Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
				66	# define ConvertUTF_RESTORE_WARNINGS \
				67	_Pragma("clang diagnostic pop")
				68	# endif
				69	#elif defined(__GNUC__) && __GNUC__ > 6
				70	# define ConvertUTF_DISABLE_WARNINGS \
				71	_Pragma("GCC diagnostic push") \
				72	_Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
				73	# define ConvertUTF_RESTORE_WARNINGS \
				74	_Pragma("GCC diagnostic pop")
				75	#endif
				76	#ifndef ConvertUTF_DISABLE_WARNINGS
				77	# define ConvertUTF_DISABLE_WARNINGS
				78	#endif
				79	#ifndef ConvertUTF_RESTORE_WARNINGS
				80	# define ConvertUTF_RESTORE_WARNINGS
				81	#endif
				82
				83	ConvertUTF_DISABLE_WARNINGS
				84
Justin Lebar	9091055	2016-09-30 00:38:45 +0000	[diff] [blame]	85	namespace llvm {
				86
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	87	static const int halfShift = 10; /* used for shifting by 10 bits */
				88
				89	static const UTF32 halfBase = 0x0010000UL;
				90	static const UTF32 halfMask = 0x3FFUL;
				91
				92	#define UNI_SUR_HIGH_START (UTF32)0xD800
				93	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				94	#define UNI_SUR_LOW_START (UTF32)0xDC00
				95	#define UNI_SUR_LOW_END (UTF32)0xDFFF
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	96
				97	/* --------------------------------------------------------------------- */
				98
				99	/*
				100	* Index into the table below with the first byte of a UTF-8 sequence to
				101	* get the number of trailing bytes that are supposed to follow it.
				102	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
				103	* left as-is for anyone who may want to do such conversion, which was
				104	* allowed in earlier algorithms.
				105	*/
				106	static const char trailingBytesForUTF8[256] = {
				107	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				108	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				109	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				110	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				111	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				112	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				113	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
				114	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
				115	};
				116
				117	/*
				118	* Magic values subtracted from a buffer value during UTF8 conversion.
				119	* This table contains as many values as there might be trailing bytes
				120	* in a UTF-8 sequence.
				121	*/
				122	static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
				123	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
				124
				125	/*
				126	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				127	* into the first byte, depending on how many bytes follow. There are
				128	* as many entries in this table as there are UTF-8 sequence types.
				129	* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
				130	* for legal UTF-8 will be 4 or fewer bytes total.
				131	*/
				132	static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
				133
				134	/* --------------------------------------------------------------------- */
				135
				136	/* The interface converts a whole buffer to avoid function-call overhead.
				137	* Constants have been gathered. Loops & conditionals have been removed as
				138	* much as possible for efficiency, in favor of drop-through switches.
				139	* (See "Note A" at the bottom of the file for equivalent code.)
				140	* If your compiler supports it, the "isLegalUTF8" call can be turned
				141	* into an inline function.
				142	*/
				143
				144
				145	/* --------------------------------------------------------------------- */
				146
				147	ConversionResult ConvertUTF32toUTF16 (
				148	const UTF32** sourceStart, const UTF32* sourceEnd,
				149	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
				150	ConversionResult result = conversionOK;
				151	const UTF32* source = *sourceStart;
				152	UTF16* target = *targetStart;
				153	while (source < sourceEnd) {
				154	UTF32 ch;
				155	if (target >= targetEnd) {
				156	result = targetExhausted; break;
				157	}
				158	ch = *source++;
				159	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				160	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				161	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				162	if (flags == strictConversion) {
				163	--source; /* return to the illegal value itself */
				164	result = sourceIllegal;
				165	break;
				166	} else {
				167	*target++ = UNI_REPLACEMENT_CHAR;
				168	}
				169	} else {
				170	target++ = (UTF16)ch; / normal case */
				171	}
				172	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				173	if (flags == strictConversion) {
				174	result = sourceIllegal;
				175	} else {
				176	*target++ = UNI_REPLACEMENT_CHAR;
				177	}
				178	} else {
				179	/* target is a character in range 0xFFFF - 0x10FFFF. */
				180	if (target + 1 >= targetEnd) {
				181	--source; /* Back up source pointer! */
				182	result = targetExhausted; break;
				183	}
				184	ch -= halfBase;
				185	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				186	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				187	}
				188	}
				189	*sourceStart = source;
				190	*targetStart = target;
				191	return result;
				192	}
				193
				194	/* --------------------------------------------------------------------- */
				195
				196	ConversionResult ConvertUTF16toUTF32 (
				197	const UTF16** sourceStart, const UTF16* sourceEnd,
				198	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
				199	ConversionResult result = conversionOK;
				200	const UTF16* source = *sourceStart;
				201	UTF32* target = *targetStart;
				202	UTF32 ch, ch2;
				203	while (source < sourceEnd) {
				204	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				205	ch = *source++;
				206	/* If we have a surrogate pair, convert to UTF32 first. */
				207	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				208	/* If the 16 bits following the high surrogate are in the source buffer... */
				209	if (source < sourceEnd) {
				210	ch2 = *source;
				211	/* If it's a low surrogate, convert to UTF32. */
				212	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				213	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				214	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				215	++source;
				216	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				217	--source; /* return to the illegal value itself */
				218	result = sourceIllegal;
				219	break;
				220	}
				221	} else { /* We don't have the 16 bits following the high surrogate. */
				222	--source; /* return to the high surrogate */
				223	result = sourceExhausted;
				224	break;
				225	}
				226	} else if (flags == strictConversion) {
				227	/* UTF-16 surrogate values are illegal in UTF-32 */
				228	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				229	--source; /* return to the illegal value itself */
				230	result = sourceIllegal;
				231	break;
				232	}
				233	}
				234	if (target >= targetEnd) {
				235	source = oldSource; /* Back up source pointer! */
				236	result = targetExhausted; break;
				237	}
				238	*target++ = ch;
				239	}
				240	*sourceStart = source;
				241	*targetStart = target;
				242	#ifdef CVTUTF_DEBUG
				243	if (result == sourceIllegal) {
				244	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
				245	fflush(stderr);
				246	}
				247	#endif
				248	return result;
				249	}
				250	ConversionResult ConvertUTF16toUTF8 (
				251	const UTF16** sourceStart, const UTF16* sourceEnd,
				252	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
				253	ConversionResult result = conversionOK;
				254	const UTF16* source = *sourceStart;
				255	UTF8* target = *targetStart;
				256	while (source < sourceEnd) {
				257	UTF32 ch;
				258	unsigned short bytesToWrite = 0;
				259	const UTF32 byteMask = 0xBF;
				260	const UTF32 byteMark = 0x80;
				261	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
				262	ch = *source++;
				263	/* If we have a surrogate pair, convert to UTF32 first. */
				264	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				265	/* If the 16 bits following the high surrogate are in the source buffer... */
				266	if (source < sourceEnd) {
				267	UTF32 ch2 = *source;
				268	/* If it's a low surrogate, convert to UTF32. */
				269	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				270	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				271	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				272	++source;
				273	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				274	--source; /* return to the illegal value itself */
				275	result = sourceIllegal;
				276	break;
				277	}
				278	} else { /* We don't have the 16 bits following the high surrogate. */
				279	--source; /* return to the high surrogate */
				280	result = sourceExhausted;
				281	break;
				282	}
				283	} else if (flags == strictConversion) {
				284	/* UTF-16 surrogate values are illegal in UTF-32 */
				285	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				286	--source; /* return to the illegal value itself */
				287	result = sourceIllegal;
				288	break;
				289	}
				290	}
				291	/* Figure out how many bytes the result will require */
				292	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				293	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				294	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				295	} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
				296	} else { bytesToWrite = 3;
				297	ch = UNI_REPLACEMENT_CHAR;
				298	}
				299
				300	target += bytesToWrite;
				301	if (target > targetEnd) {
				302	source = oldSource; /* Back up source pointer! */
				303	target -= bytesToWrite; result = targetExhausted; break;
				304	}
				305	switch (bytesToWrite) { /* note: everything falls through. */
				306	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				307	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				308	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				309	case 1: *--target = (UTF8)(ch \| firstByteMark[bytesToWrite]);
				310	}
				311	target += bytesToWrite;
				312	}
				313	*sourceStart = source;
				314	*targetStart = target;
				315	return result;
				316	}
				317
				318	/* --------------------------------------------------------------------- */
				319
				320	ConversionResult ConvertUTF32toUTF8 (
				321	const UTF32** sourceStart, const UTF32* sourceEnd,
				322	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
				323	ConversionResult result = conversionOK;
				324	const UTF32* source = *sourceStart;
				325	UTF8* target = *targetStart;
				326	while (source < sourceEnd) {
				327	UTF32 ch;
				328	unsigned short bytesToWrite = 0;
				329	const UTF32 byteMask = 0xBF;
				330	const UTF32 byteMark = 0x80;
				331	ch = *source++;
				332	if (flags == strictConversion ) {
				333	/* UTF-16 surrogate values are illegal in UTF-32 */
				334	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				335	--source; /* return to the illegal value itself */
				336	result = sourceIllegal;
				337	break;
				338	}
				339	}
				340	/*
				341	* Figure out how many bytes the result will require. Turn any
				342	* illegally large UTF32 things (> Plane 17) into replacement chars.
				343	*/
				344	if (ch < (UTF32)0x80) { bytesToWrite = 1;
				345	} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
				346	} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
				347	} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
				348	} else { bytesToWrite = 3;
				349	ch = UNI_REPLACEMENT_CHAR;
				350	result = sourceIllegal;
				351	}
				352
				353	target += bytesToWrite;
				354	if (target > targetEnd) {
				355	--source; /* Back up source pointer! */
				356	target -= bytesToWrite; result = targetExhausted; break;
				357	}
				358	switch (bytesToWrite) { /* note: everything falls through. */
				359	case 4: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				360	case 3: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				361	case 2: *--target = (UTF8)((ch \| byteMark) & byteMask); ch >>= 6;
				362	case 1: *--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				363	}
				364	target += bytesToWrite;
				365	}
				366	*sourceStart = source;
				367	*targetStart = target;
				368	return result;
				369	}
				370
				371	/* --------------------------------------------------------------------- */
				372
				373	/*
				374	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				375	* This must be called with the length pre-determined by the first byte.
				376	* If not calling this from ConvertUTF8to*, then the length can be set by:
				377	* length = trailingBytesForUTF8[*source]+1;
				378	* and the sequence is illegal right away if there aren't that many bytes
				379	* available.
				380	* If presented with a length > 4, this returns false. The Unicode
				381	* definition of UTF-8 goes up to 4-byte sequences.
				382	*/
				383
				384	static Boolean isLegalUTF8(const UTF8 *source, int length) {
				385	UTF8 a;
				386	const UTF8 *srcptr = source+length;
				387	switch (length) {
				388	default: return false;
				389	/* Everything else falls through when "true"... */
				390	case 4: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				391	case 3: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				392	case 2: if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				393
				394	switch (*source) {
				395	/* no fall-through in this inner switch */
				396	case 0xE0: if (a < 0xA0) return false; break;
				397	case 0xED: if (a > 0x9F) return false; break;
				398	case 0xF0: if (a < 0x90) return false; break;
				399	case 0xF4: if (a > 0x8F) return false; break;
				400	default: if (a < 0x80) return false;
				401	}
				402
				403	case 1: if (source >= 0x80 && source < 0xC2) return false;
				404	}
				405	if (*source > 0xF4) return false;
				406	return true;
				407	}
				408
				409	/* --------------------------------------------------------------------- */
				410
				411	/*
				412	* Exported function to return whether a UTF-8 sequence is legal or not.
				413	* This is not used here; it's just exported.
				414	*/
				415	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				416	int length = trailingBytesForUTF8[*source]+1;
				417	if (length > sourceEnd - source) {
				418	return false;
				419	}
				420	return isLegalUTF8(source, length);
				421	}
				422
				423	/* --------------------------------------------------------------------- */
				424
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	425	static unsigned
				426	findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
				427	const UTF8 *sourceEnd) {
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	428	UTF8 b1, b2, b3;
				429
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	430	assert(!isLegalUTF8Sequence(source, sourceEnd));
				431
				432	/*
				433	* Unicode 6.3.0, D93b:
				434	*
				435	* Maximal subpart of an ill-formed subsequence: The longest code unit
				436	* subsequence starting at an unconvertible offset that is either:
				437	* a. the initial subsequence of a well-formed code unit sequence, or
				438	* b. a subsequence of length one.
				439	*/
				440
				441	if (source == sourceEnd)
				442	return 0;
				443
				444	/*
				445	* Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
				446	* Byte Sequences.
				447	*/
				448
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	449	b1 = *source;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	450	++source;
				451	if (b1 >= 0xC2 && b1 <= 0xDF) {
				452	/*
				453	* First byte is valid, but we know that this code unit sequence is
				454	* invalid, so the maximal subpart has to end after the first byte.
				455	*/
				456	return 1;
				457	}
				458
				459	if (source == sourceEnd)
				460	return 1;
				461
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	462	b2 = *source;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	463	++source;
				464
				465	if (b1 == 0xE0) {
				466	return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
				467	}
				468	if (b1 >= 0xE1 && b1 <= 0xEC) {
				469	return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
				470	}
				471	if (b1 == 0xED) {
				472	return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
				473	}
				474	if (b1 >= 0xEE && b1 <= 0xEF) {
				475	return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
				476	}
				477	if (b1 == 0xF0) {
				478	if (b2 >= 0x90 && b2 <= 0xBF) {
				479	if (source == sourceEnd)
				480	return 2;
				481
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	482	b3 = *source;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	483	return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
				484	}
				485	return 1;
				486	}
				487	if (b1 >= 0xF1 && b1 <= 0xF3) {
				488	if (b2 >= 0x80 && b2 <= 0xBF) {
				489	if (source == sourceEnd)
				490	return 2;
				491
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	492	b3 = *source;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	493	return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
				494	}
				495	return 1;
				496	}
				497	if (b1 == 0xF4) {
				498	if (b2 >= 0x80 && b2 <= 0x8F) {
				499	if (source == sourceEnd)
				500	return 2;
				501
Dmitri Gribenko	caee8cb	2014-06-16 11:22:33 +0000	[diff] [blame]	502	b3 = *source;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	503	return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
				504	}
				505	return 1;
				506	}
				507
				508	assert((b1 >= 0x80 && b1 <= 0xC1) \|\| b1 >= 0xF5);
				509	/*
				510	* There are no valid sequences that start with these bytes. Maximal subpart
				511	* is defined to have length 1 in these cases.
				512	*/
				513	return 1;
				514	}
				515
				516	/* --------------------------------------------------------------------- */
				517
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	518	/*
				519	* Exported function to return the total number of bytes in a codepoint
				520	* represented in UTF-8, given the value of the first byte.
				521	*/
				522	unsigned getNumBytesForUTF8(UTF8 first) {
				523	return trailingBytesForUTF8[first] + 1;
				524	}
				525
				526	/* --------------------------------------------------------------------- */
				527
				528	/*
				529	* Exported function to return whether a UTF-8 string is legal or not.
				530	* This is not used here; it's just exported.
				531	*/
				532	Boolean isLegalUTF8String(const UTF8 *source, const UTF8 sourceEnd) {
				533	while (*source != sourceEnd) {
				534	int length = trailingBytesForUTF8[**source] + 1;
				535	if (length > sourceEnd - source \|\| !isLegalUTF8(source, length))
				536	return false;
				537	*source += length;
				538	}
				539	return true;
				540	}
				541
				542	/* --------------------------------------------------------------------- */
				543
				544	ConversionResult ConvertUTF8toUTF16 (
				545	const UTF8** sourceStart, const UTF8* sourceEnd,
				546	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
				547	ConversionResult result = conversionOK;
				548	const UTF8* source = *sourceStart;
				549	UTF16* target = *targetStart;
				550	while (source < sourceEnd) {
				551	UTF32 ch = 0;
				552	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				553	if (extraBytesToRead >= sourceEnd - source) {
				554	result = sourceExhausted; break;
				555	}
				556	/* Do this check whether lenient or strict */
				557	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				558	result = sourceIllegal;
				559	break;
				560	}
				561	/*
				562	* The cases all fall through. See "Note A" below.
				563	*/
				564	switch (extraBytesToRead) {
				565	case 5: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				566	case 4: ch += source++; ch <<= 6; / remember, illegal UTF-8 */
				567	case 3: ch += *source++; ch <<= 6;
				568	case 2: ch += *source++; ch <<= 6;
				569	case 1: ch += *source++; ch <<= 6;
				570	case 0: ch += *source++;
				571	}
				572	ch -= offsetsFromUTF8[extraBytesToRead];
				573
				574	if (target >= targetEnd) {
				575	source -= (extraBytesToRead+1); /* Back up source pointer! */
				576	result = targetExhausted; break;
				577	}
				578	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				579	/* UTF-16 surrogate values are illegal in UTF-32 */
				580	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				581	if (flags == strictConversion) {
				582	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				583	result = sourceIllegal;
				584	break;
				585	} else {
				586	*target++ = UNI_REPLACEMENT_CHAR;
				587	}
				588	} else {
				589	target++ = (UTF16)ch; / normal case */
				590	}
				591	} else if (ch > UNI_MAX_UTF16) {
				592	if (flags == strictConversion) {
				593	result = sourceIllegal;
				594	source -= (extraBytesToRead+1); /* return to the start */
				595	break; /* Bail out; shouldn't continue */
				596	} else {
				597	*target++ = UNI_REPLACEMENT_CHAR;
				598	}
				599	} else {
				600	/* target is a character in range 0xFFFF - 0x10FFFF. */
				601	if (target + 1 >= targetEnd) {
				602	source -= (extraBytesToRead+1); /* Back up source pointer! */
				603	result = targetExhausted; break;
				604	}
				605	ch -= halfBase;
				606	*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
				607	*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
				608	}
				609	}
				610	*sourceStart = source;
				611	*targetStart = target;
				612	return result;
				613	}
				614
				615	/* --------------------------------------------------------------------- */
				616
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	617	static ConversionResult ConvertUTF8toUTF32Impl(
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	618	const UTF8** sourceStart, const UTF8* sourceEnd,
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	619	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
				620	Boolean InputIsPartial) {
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	621	ConversionResult result = conversionOK;
				622	const UTF8* source = *sourceStart;
				623	UTF32* target = *targetStart;
				624	while (source < sourceEnd) {
				625	UTF32 ch = 0;
				626	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				627	if (extraBytesToRead >= sourceEnd - source) {
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	628	if (flags == strictConversion \|\| InputIsPartial) {
				629	result = sourceExhausted;
				630	break;
				631	} else {
				632	result = sourceIllegal;
				633
				634	/*
				635	* Replace the maximal subpart of ill-formed sequence with
				636	* replacement character.
				637	*/
				638	source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
				639	sourceEnd);
				640	*target++ = UNI_REPLACEMENT_CHAR;
				641	continue;
				642	}
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	643	}
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	644	if (target >= targetEnd) {
				645	result = targetExhausted; break;
				646	}
				647
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	648	/* Do this check whether lenient or strict */
				649	if (!isLegalUTF8(source, extraBytesToRead+1)) {
				650	result = sourceIllegal;
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	651	if (flags == strictConversion) {
				652	/* Abort conversion. */
				653	break;
				654	} else {
				655	/*
				656	* Replace the maximal subpart of ill-formed sequence with
				657	* replacement character.
				658	*/
				659	source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
				660	sourceEnd);
				661	*target++ = UNI_REPLACEMENT_CHAR;
				662	continue;
				663	}
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	664	}
				665	/*
				666	* The cases all fall through. See "Note A" below.
				667	*/
				668	switch (extraBytesToRead) {
				669	case 5: ch += *source++; ch <<= 6;
				670	case 4: ch += *source++; ch <<= 6;
				671	case 3: ch += *source++; ch <<= 6;
				672	case 2: ch += *source++; ch <<= 6;
				673	case 1: ch += *source++; ch <<= 6;
				674	case 0: ch += *source++;
				675	}
				676	ch -= offsetsFromUTF8[extraBytesToRead];
				677
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	678	if (ch <= UNI_MAX_LEGAL_UTF32) {
				679	/*
				680	* UTF-16 surrogate values are illegal in UTF-32, and anything
				681	* over Plane 17 (> 0x10FFFF) is illegal.
				682	*/
				683	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				684	if (flags == strictConversion) {
				685	source -= (extraBytesToRead+1); /* return to the illegal value itself */
				686	result = sourceIllegal;
				687	break;
				688	} else {
				689	*target++ = UNI_REPLACEMENT_CHAR;
				690	}
				691	} else {
				692	*target++ = ch;
				693	}
				694	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				695	result = sourceIllegal;
				696	*target++ = UNI_REPLACEMENT_CHAR;
				697	}
				698	}
				699	*sourceStart = source;
				700	*targetStart = target;
				701	return result;
				702	}
				703
Dmitri Gribenko	1089db0	2014-06-16 11:09:46 +0000	[diff] [blame]	704	ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
				705	const UTF8 *sourceEnd,
				706	UTF32 **targetStart,
				707	UTF32 *targetEnd,
				708	ConversionFlags flags) {
				709	return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
				710	flags, /InputIsPartial=/true);
				711	}
				712
				713	ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
				714	const UTF8 sourceEnd, UTF32 *targetStart,
				715	UTF32 *targetEnd, ConversionFlags flags) {
				716	return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
				717	flags, /InputIsPartial=/false);
				718	}
				719
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	720	/* ---------------------------------------------------------------------
				721
				722	Note A.
				723	The fall-through switches in UTF-8 reading code save a
				724	temp variable, some decrements & conditionals. The switches
				725	are equivalent to the following loop:
				726	{
				727	int tmpBytesToRead = extraBytesToRead+1;
				728	do {
				729	ch += *source++;
				730	--tmpBytesToRead;
				731	if (tmpBytesToRead) ch <<= 6;
				732	} while (tmpBytesToRead > 0);
				733	}
				734	In UTF-8 writing code, the switches on "bytesToWrite" are
				735	similarly unrolled loops.
				736
				737	--------------------------------------------------------------------- */
Justin Lebar	9091055	2016-09-30 00:38:45 +0000	[diff] [blame]	738
				739	} // namespace llvm
Galina Kistanova	229c9c1	2017-05-29 01:34:26 +0000	[diff] [blame]	740
				741	ConvertUTF_RESTORE_WARNINGS