Blame - llvm/lib/Support/ConvertUTFWrapper.cpp - toolchain/llvm-project

blob: f3cef5240f6f4908cb18b4ee1bb6c786b6b686cb [file] [log] [blame]

Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	1	//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9
				10	#include "llvm/Support/ConvertUTF.h"
Marianne Mailhot-Sarrasin	7423f40	2016-03-11 15:59:32 +0000	[diff] [blame^]	11	#include "llvm/Support/ErrorHandling.h"
Reid Kleckner	7df03c2	2013-07-16 17:14:33 +0000	[diff] [blame]	12	#include "llvm/Support/SwapByteOrder.h"
				13	#include <string>
				14	#include <vector>
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	15
				16	namespace llvm {
				17
				18	bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
				19	char &ResultPtr, const UTF8 &ErrorPtr) {
				20	assert(WideCharWidth == 1 \|\| WideCharWidth == 2 \|\| WideCharWidth == 4);
				21	ConversionResult result = conversionOK;
				22	// Copy the character span over.
				23	if (WideCharWidth == 1) {
				24	const UTF8 Pos = reinterpret_cast<const UTF8>(Source.begin());
				25	if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
				26	result = sourceIllegal;
				27	ErrorPtr = Pos;
				28	} else {
				29	memcpy(ResultPtr, Source.data(), Source.size());
				30	ResultPtr += Source.size();
				31	}
				32	} else if (WideCharWidth == 2) {
				33	const UTF8 sourceStart = (const UTF8)Source.data();
				34	// FIXME: Make the type of the result buffer correct instead of
				35	// using reinterpret_cast.
				36	UTF16 targetStart = reinterpret_cast<UTF16>(ResultPtr);
				37	ConversionFlags flags = strictConversion;
				38	result = ConvertUTF8toUTF16(
				39	&sourceStart, sourceStart + Source.size(),
Marianne Mailhot-Sarrasin	7423f40	2016-03-11 15:59:32 +0000	[diff] [blame^]	40	&targetStart, targetStart + Source.size(), flags);
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	41	if (result == conversionOK)
				42	ResultPtr = reinterpret_cast<char*>(targetStart);
				43	else
				44	ErrorPtr = sourceStart;
				45	} else if (WideCharWidth == 4) {
				46	const UTF8 sourceStart = (const UTF8)Source.data();
				47	// FIXME: Make the type of the result buffer correct instead of
				48	// using reinterpret_cast.
				49	UTF32 targetStart = reinterpret_cast<UTF32>(ResultPtr);
				50	ConversionFlags flags = strictConversion;
				51	result = ConvertUTF8toUTF32(
				52	&sourceStart, sourceStart + Source.size(),
Marianne Mailhot-Sarrasin	7423f40	2016-03-11 15:59:32 +0000	[diff] [blame^]	53	&targetStart, targetStart + Source.size(), flags);
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	54	if (result == conversionOK)
				55	ResultPtr = reinterpret_cast<char*>(targetStart);
				56	else
				57	ErrorPtr = sourceStart;
				58	}
				59	assert((result != targetExhausted)
				60	&& "ConvertUTF8toUTFXX exhausted target buffer");
				61	return result == conversionOK;
				62	}
				63
				64	bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
				65	const UTF32 *SourceStart = &Source;
				66	const UTF32 *SourceEnd = SourceStart + 1;
				67	UTF8 TargetStart = reinterpret_cast<UTF8 >(ResultPtr);
				68	UTF8 *TargetEnd = TargetStart + 4;
				69	ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
				70	&TargetStart, TargetEnd,
				71	strictConversion);
				72	if (CR != conversionOK)
				73	return false;
				74
				75	ResultPtr = reinterpret_cast<char*>(TargetStart);
				76	return true;
				77	}
				78
Reid Kleckner	7df03c2	2013-07-16 17:14:33 +0000	[diff] [blame]	79	bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
				80	return (S.size() >= 2 &&
				81	((S[0] == '\xff' && S[1] == '\xfe') \|\|
				82	(S[0] == '\xfe' && S[1] == '\xff')));
				83	}
				84
				85	bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
				86	assert(Out.empty());
				87
				88	// Error out on an uneven byte count.
				89	if (SrcBytes.size() % 2)
				90	return false;
				91
				92	// Avoid OOB by returning early on empty input.
				93	if (SrcBytes.empty())
				94	return true;
				95
				96	const UTF16 Src = reinterpret_cast<const UTF16 >(SrcBytes.begin());
				97	const UTF16 SrcEnd = reinterpret_cast<const UTF16 >(SrcBytes.end());
				98
				99	// Byteswap if necessary.
				100	std::vector<UTF16> ByteSwapped;
				101	if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
				102	ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
				103	for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I)
				104	ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);
				105	Src = &ByteSwapped[0];
				106	SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
				107	}
				108
				109	// Skip the BOM for conversion.
				110	if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
				111	Src++;
				112
Zachary Turner	02991af	2015-01-26 22:05:50 +0000	[diff] [blame]	113	// Just allocate enough space up front. We'll shrink it later. Allocate
				114	// enough that we can fit a null terminator without reallocating.
				115	Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
Reid Kleckner	7df03c2	2013-07-16 17:14:33 +0000	[diff] [blame]	116	UTF8 Dst = reinterpret_cast<UTF8 >(&Out[0]);
				117	UTF8 *DstEnd = Dst + Out.size();
				118
				119	ConversionResult CR =
				120	ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
				121	assert(CR != targetExhausted);
				122
				123	if (CR != conversionOK) {
				124	Out.clear();
				125	return false;
				126	}
				127
				128	Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
Zachary Turner	02991af	2015-01-26 22:05:50 +0000	[diff] [blame]	129	Out.push_back(0);
				130	Out.pop_back();
Reid Kleckner	7df03c2	2013-07-16 17:14:33 +0000	[diff] [blame]	131	return true;
				132	}
				133
Marianne Mailhot-Sarrasin	7423f40	2016-03-11 15:59:32 +0000	[diff] [blame^]	134	bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
				135	{
				136	return convertUTF16ToUTF8String(
				137	llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
				138	Src.size() * sizeof(UTF16)), Out);
				139	}
				140
Reid Kleckner	d8cb6b0	2015-01-26 19:51:00 +0000	[diff] [blame]	141	bool convertUTF8ToUTF16String(StringRef SrcUTF8,
				142	SmallVectorImpl<UTF16> &DstUTF16) {
				143	assert(DstUTF16.empty());
				144
				145	// Avoid OOB by returning early on empty input.
Zachary Turner	afdff42	2015-02-08 18:08:51 +0000	[diff] [blame]	146	if (SrcUTF8.empty()) {
				147	DstUTF16.push_back(0);
				148	DstUTF16.pop_back();
Reid Kleckner	d8cb6b0	2015-01-26 19:51:00 +0000	[diff] [blame]	149	return true;
Zachary Turner	afdff42	2015-02-08 18:08:51 +0000	[diff] [blame]	150	}
Reid Kleckner	d8cb6b0	2015-01-26 19:51:00 +0000	[diff] [blame]	151
				152	const UTF8 Src = reinterpret_cast<const UTF8 >(SrcUTF8.begin());
				153	const UTF8 SrcEnd = reinterpret_cast<const UTF8 >(SrcUTF8.end());
				154
				155	// Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
				156	// as UTF-16 should always require the same amount or less code units than the
Zachary Turner	02991af	2015-01-26 22:05:50 +0000	[diff] [blame]	157	// UTF-8 encoding. Allocate one extra byte for the null terminator though,
				158	// so that someone calling DstUTF16.data() gets a null terminated string.
				159	// We resize down later so we don't have to worry that this over allocates.
				160	DstUTF16.resize(SrcUTF8.size()+1);
Reid Kleckner	d8cb6b0	2015-01-26 19:51:00 +0000	[diff] [blame]	161	UTF16 *Dst = &DstUTF16[0];
				162	UTF16 *DstEnd = Dst + DstUTF16.size();
				163
				164	ConversionResult CR =
				165	ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
				166	assert(CR != targetExhausted);
				167
				168	if (CR != conversionOK) {
				169	DstUTF16.clear();
				170	return false;
				171	}
				172
				173	DstUTF16.resize(Dst - &DstUTF16[0]);
Zachary Turner	02991af	2015-01-26 22:05:50 +0000	[diff] [blame]	174	DstUTF16.push_back(0);
				175	DstUTF16.pop_back();
Reid Kleckner	d8cb6b0	2015-01-26 19:51:00 +0000	[diff] [blame]	176	return true;
				177	}
				178
// The wchar_t conversions below dispatch on sizeof(wchar_t) and only handle
// these three widths; reject any other platform at compile time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
              sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");
				182
				183	template <typename TResult>
				184	static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
				185	TResult &Result) {
				186	// Even in the case of UTF-16, the number of bytes in a UTF-8 string is
				187	// at least as large as the number of elements in the resulting wide
				188	// string, because surrogate pairs take at least 4 bytes in UTF-8.
				189	Result.resize(Source.size() + 1);
				190	char ResultPtr = reinterpret_cast<char >(&Result[0]);
				191	const UTF8 *ErrorPtr;
				192	if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
				193	Result.clear();
				194	return false;
				195	}
				196	Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);
				197	return true;
				198	}
				199
				200	bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
				201	return ConvertUTF8toWideInternal(Source, Result);
				202	}
				203
				204	bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
				205	if (!Source) {
				206	Result.clear();
				207	return true;
				208	}
				209	return ConvertUTF8toWide(llvm::StringRef(Source), Result);
				210	}
				211
				212	bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
				213	if (sizeof(wchar_t) == 1) {
				214	const UTF8 Start = reinterpret_cast<const UTF8 >(Source.data());
				215	const UTF8 *End =
				216	reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
				217	if (!isLegalUTF8String(&Start, End))
				218	return false;
				219	Result.resize(Source.size());
				220	memcpy(&Result[0], Source.data(), Source.size());
				221	return true;
				222	} else if (sizeof(wchar_t) == 2) {
				223	return convertUTF16ToUTF8String(
				224	llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
				225	Source.size()),
				226	Result);
				227	} else if (sizeof(wchar_t) == 4) {
				228	const UTF32 Start = reinterpret_cast<const UTF32 >(Source.data());
				229	const UTF32 *End =
				230	reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
				231	Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
				232	UTF8 ResultPtr = reinterpret_cast<UTF8 >(&Result[0]);
				233	UTF8 ResultEnd = reinterpret_cast<UTF8 >(&Result[0] + Result.size());
				234	if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
				235	strictConversion) == conversionOK) {
				236	Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
				237	return true;
				238	} else {
				239	Result.clear();
				240	return false;
				241	}
				242	} else {
				243	llvm_unreachable(
				244	"Control should never reach this point; see static_assert further up");
				245	}
				246	}
				247
Dmitri Gribenko	b311f4e	2013-01-30 12:05:05 +0000	[diff] [blame]	248	} // end namespace llvm
				249