Blame - clang/lib/Format/Encoding.h - toolchain/llvm-project

blob: a44f4590a24873034f23fa6df782a978f8c50135 [file] [log] [blame]

Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame^]	1	//===--- Encoding.h - Format C++ code -------------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	///
				10	/// \file
				11	/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
				12	/// 8-bit encodings and escape sequences in C++ string literals.
				13	///
				14	//===----------------------------------------------------------------------===//
				15
				16	#ifndef LLVM_CLANG_FORMAT_ENCODING_H
				17	#define LLVM_CLANG_FORMAT_ENCODING_H
				18
				19	#include "clang/Basic/LLVM.h"
				20	#include "llvm/Support/ConvertUTF.h"
				21
				22	namespace clang {
				23	namespace format {
				24	namespace encoding {
				25
				26	enum Encoding {
				27	Encoding_UTF8,
				28	Encoding_Unknown // We treat all other encodings as 8-bit encodings.
				29	};
				30
				31	/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
				32	/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
				33	inline Encoding detectEncoding(StringRef Text) {
				34	const UTF8 Ptr = reinterpret_cast<const UTF8 >(Text.begin());
				35	const UTF8 BufEnd = reinterpret_cast<const UTF8 >(Text.end());
				36	if (::isLegalUTF8String(&Ptr, BufEnd))
				37	return Encoding_UTF8;
				38	return Encoding_Unknown;
				39	}
				40
				41	inline unsigned getCodePointCountUTF8(StringRef Text) {
				42	unsigned CodePoints = 0;
				43	for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
				44	++CodePoints;
				45	}
				46	return CodePoints;
				47	}
				48
				49	/// \brief Gets the number of code points in the Text using the specified
				50	/// Encoding.
				51	inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
				52	switch (Encoding) {
				53	case Encoding_UTF8:
				54	return getCodePointCountUTF8(Text);
				55	default:
				56	return Text.size();
				57	}
				58	}
				59
				60	/// \brief Gets the number of bytes in a sequence representing a single
				61	/// codepoint and starting with FirstChar in the specified Encoding.
				62	inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
				63	switch (Encoding) {
				64	case Encoding_UTF8:
				65	return getNumBytesForUTF8(FirstChar);
				66	default:
				67	return 1;
				68	}
				69	}
				70
				71	inline bool isOctDigit(char c) {
				72	return '0' <= c && c <= '7';
				73	}
				74
				75	inline bool isHexDigit(char c) {
				76	return ('0' <= c && c <= '9') \|\| ('a' <= c && c <= 'f') \|\|
				77	('A' <= c && c <= 'F');
				78	}
				79
				80	/// \brief Gets the length of an escape sequence inside a C++ string literal.
				81	/// Text should span from the beginning of the escape sequence (starting with a
				82	/// backslash) to the end of the string literal.
				83	inline unsigned getEscapeSequenceLength(StringRef Text) {
				84	assert(Text[0] == '\\');
				85	if (Text.size() < 2)
				86	return 1;
				87
				88	switch (Text[1]) {
				89	case 'u':
				90	return 6;
				91	case 'U':
				92	return 10;
				93	case 'x': {
				94	unsigned I = 2; // Point after '\x'.
				95	while (I < Text.size() && isHexDigit(Text[I]))
				96	++I;
				97	return I;
				98	}
				99	default:
				100	if (isOctDigit(Text[1])) {
				101	unsigned I = 1;
				102	while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
				103	++I;
				104	return I;
				105	}
				106	return 2;
				107	}
				108	}
				109
				110	} // namespace encoding
				111	} // namespace format
				112	} // namespace clang
				113
				114	#endif // LLVM_CLANG_FORMAT_ENCODING_H