Blame - clang/lib/Format/Encoding.h - toolchain/llvm-project

blob: 148f7fd0e91ba5dc1ee0572ebd02b4ba5d34979b [file] [log] [blame]

Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	1	//===--- Encoding.h - Format C++ code -------------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	///
				10	/// \file
				11	/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
				12	/// 8-bit encodings and escape sequences in C++ string literals.
				13	///
				14	//===----------------------------------------------------------------------===//
				15
Benjamin Kramer	2f5db8b	2014-08-13 16:25:19 +0000	[diff] [blame]	16	#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
				17	#define LLVM_CLANG_LIB_FORMAT_ENCODING_H
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	18
				19	#include "clang/Basic/LLVM.h"
Mehdi Amini	7322b8b	2016-04-18 09:08:59 +0000	[diff] [blame^]	20	#include "llvm/ADT/StringRef.h"
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	21	#include "llvm/Support/ConvertUTF.h"
Alexander Kornienko	ebb43ca	2013-09-05 14:08:34 +0000	[diff] [blame]	22	#include "llvm/Support/Unicode.h"
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	23
				24	namespace clang {
				25	namespace format {
				26	namespace encoding {
				27
				28	enum Encoding {
				29	Encoding_UTF8,
				30	Encoding_Unknown // We treat all other encodings as 8-bit encodings.
				31	};
				32
				33	/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
				34	/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
				35	inline Encoding detectEncoding(StringRef Text) {
				36	const UTF8 Ptr = reinterpret_cast<const UTF8 >(Text.begin());
				37	const UTF8 BufEnd = reinterpret_cast<const UTF8 >(Text.end());
				38	if (::isLegalUTF8String(&Ptr, BufEnd))
				39	return Encoding_UTF8;
				40	return Encoding_Unknown;
				41	}
				42
				43	inline unsigned getCodePointCountUTF8(StringRef Text) {
				44	unsigned CodePoints = 0;
				45	for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
				46	++CodePoints;
				47	}
				48	return CodePoints;
				49	}
				50
				51	/// \brief Gets the number of code points in the Text using the specified
				52	/// Encoding.
				53	inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
				54	switch (Encoding) {
Daniel Jasper	3ac9b9e	2013-07-08 14:34:09 +0000	[diff] [blame]	55	case Encoding_UTF8:
				56	return getCodePointCountUTF8(Text);
				57	default:
				58	return Text.size();
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	59	}
				60	}
				61
Alexander Kornienko	ebb43ca	2013-09-05 14:08:34 +0000	[diff] [blame]	62	/// \brief Returns the number of columns required to display the \p Text on a
				63	/// generic Unicode-capable terminal. Text is assumed to use the specified
				64	/// \p Encoding.
				65	inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
				66	if (Encoding == Encoding_UTF8) {
				67	int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
Alexander Kornienko	71d95d6	2013-11-26 10:38:53 +0000	[diff] [blame]	68	// FIXME: Figure out the correct way to handle this in the presence of both
				69	// printable and unprintable multi-byte UTF-8 characters. Falling back to
				70	// returning the number of bytes may cause problems, as columnWidth suddenly
				71	// becomes non-additive.
Alexander Kornienko	ebb43ca	2013-09-05 14:08:34 +0000	[diff] [blame]	72	if (ContentWidth >= 0)
				73	return ContentWidth;
				74	}
				75	return Text.size();
				76	}
				77
				78	/// \brief Returns the number of columns required to display the \p Text,
				79	/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
				80	/// text is assumed to use the specified \p Encoding.
				81	inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
				82	unsigned TabWidth, Encoding Encoding) {
				83	unsigned TotalWidth = 0;
				84	StringRef Tail = Text;
				85	for (;;) {
				86	StringRef::size_type TabPos = Tail.find('\t');
				87	if (TabPos == StringRef::npos)
				88	return TotalWidth + columnWidth(Tail, Encoding);
Alexander Kornienko	71d95d6	2013-11-26 10:38:53 +0000	[diff] [blame]	89	TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
Alexander Kornienko	ebb43ca	2013-09-05 14:08:34 +0000	[diff] [blame]	90	TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
				91	Tail = Tail.substr(TabPos + 1);
				92	}
				93	}
				94
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	95	/// \brief Gets the number of bytes in a sequence representing a single
				96	/// codepoint and starting with FirstChar in the specified Encoding.
				97	inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
				98	switch (Encoding) {
Daniel Jasper	3ac9b9e	2013-07-08 14:34:09 +0000	[diff] [blame]	99	case Encoding_UTF8:
				100	return getNumBytesForUTF8(FirstChar);
				101	default:
				102	return 1;
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	103	}
				104	}
				105
/// \brief Returns true if \p c is an octal digit ('0' through '7').
inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; }
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	107
				108	inline bool isHexDigit(char c) {
				109	return ('0' <= c && c <= '9') \|\| ('a' <= c && c <= 'f') \|\|
				110	('A' <= c && c <= 'F');
				111	}
				112
				113	/// \brief Gets the length of an escape sequence inside a C++ string literal.
				114	/// Text should span from the beginning of the escape sequence (starting with a
				115	/// backslash) to the end of the string literal.
				116	inline unsigned getEscapeSequenceLength(StringRef Text) {
				117	assert(Text[0] == '\\');
				118	if (Text.size() < 2)
				119	return 1;
				120
				121	switch (Text[1]) {
				122	case 'u':
				123	return 6;
				124	case 'U':
				125	return 10;
				126	case 'x': {
				127	unsigned I = 2; // Point after '\x'.
				128	while (I < Text.size() && isHexDigit(Text[I]))
				129	++I;
				130	return I;
				131	}
				132	default:
				133	if (isOctDigit(Text[1])) {
				134	unsigned I = 1;
				135	while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
				136	++I;
				137	return I;
				138	}
Daniel Jasper	e35c220	2015-07-20 23:28:07 +0000	[diff] [blame]	139	return 1 + getNumBytesForUTF8(Text[1]);
Alexander Kornienko	ffcc010	2013-06-05 14:09:10 +0000	[diff] [blame]	140	}
				141	}
				142
				143	} // namespace encoding
				144	} // namespace format
				145	} // namespace clang
				146
Benjamin Kramer	2f5db8b	2014-08-13 16:25:19 +0000	[diff] [blame]	147	#endif