blob: b2103cb412eed8117382e7eb3802302b1029b327 [file] [log] [blame]
//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
///
//===----------------------------------------------------------------------===//
15
Benjamin Kramer2f5db8b2014-08-13 16:25:19 +000016#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
17#define LLVM_CLANG_LIB_FORMAT_ENCODING_H
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000018
19#include "clang/Basic/LLVM.h"
Mehdi Amini7322b8b2016-04-18 09:08:59 +000020#include "llvm/ADT/StringRef.h"
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000021#include "llvm/Support/ConvertUTF.h"
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000022#include "llvm/Support/Unicode.h"
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000023
24namespace clang {
25namespace format {
26namespace encoding {
27
/// \brief Text encodings distinguished by the helpers in this file.
enum Encoding {
  /// The text decodes as UTF-8.
  Encoding_UTF8,
  /// We treat all other encodings as 8-bit encodings.
  Encoding_Unknown
};
32
33/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
34/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
35inline Encoding detectEncoding(StringRef Text) {
Justin Lebar90910552016-09-30 00:38:45 +000036 const llvm::UTF8 *Ptr = reinterpret_cast<const llvm::UTF8 *>(Text.begin());
37 const llvm::UTF8 *BufEnd = reinterpret_cast<const llvm::UTF8 *>(Text.end());
38 if (llvm::isLegalUTF8String(&Ptr, BufEnd))
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000039 return Encoding_UTF8;
40 return Encoding_Unknown;
41}
42
43inline unsigned getCodePointCountUTF8(StringRef Text) {
44 unsigned CodePoints = 0;
Justin Lebar90910552016-09-30 00:38:45 +000045 for (size_t i = 0, e = Text.size(); i < e;
46 i += llvm::getNumBytesForUTF8(Text[i])) {
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000047 ++CodePoints;
48 }
49 return CodePoints;
50}
51
52/// \brief Gets the number of code points in the Text using the specified
53/// Encoding.
54inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
55 switch (Encoding) {
Daniel Jasper3ac9b9e2013-07-08 14:34:09 +000056 case Encoding_UTF8:
57 return getCodePointCountUTF8(Text);
58 default:
59 return Text.size();
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000060 }
61}
62
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000063/// \brief Returns the number of columns required to display the \p Text on a
64/// generic Unicode-capable terminal. Text is assumed to use the specified
65/// \p Encoding.
66inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
67 if (Encoding == Encoding_UTF8) {
68 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
Alexander Kornienko71d95d62013-11-26 10:38:53 +000069 // FIXME: Figure out the correct way to handle this in the presence of both
70 // printable and unprintable multi-byte UTF-8 characters. Falling back to
71 // returning the number of bytes may cause problems, as columnWidth suddenly
72 // becomes non-additive.
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000073 if (ContentWidth >= 0)
74 return ContentWidth;
75 }
76 return Text.size();
77}
78
79/// \brief Returns the number of columns required to display the \p Text,
80/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
81/// text is assumed to use the specified \p Encoding.
82inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
83 unsigned TabWidth, Encoding Encoding) {
84 unsigned TotalWidth = 0;
85 StringRef Tail = Text;
86 for (;;) {
87 StringRef::size_type TabPos = Tail.find('\t');
88 if (TabPos == StringRef::npos)
89 return TotalWidth + columnWidth(Tail, Encoding);
Alexander Kornienko71d95d62013-11-26 10:38:53 +000090 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000091 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
92 Tail = Tail.substr(TabPos + 1);
93 }
94}
95
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000096/// \brief Gets the number of bytes in a sequence representing a single
97/// codepoint and starting with FirstChar in the specified Encoding.
98inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
99 switch (Encoding) {
Daniel Jasper3ac9b9e2013-07-08 14:34:09 +0000100 case Encoding_UTF8:
Justin Lebar90910552016-09-30 00:38:45 +0000101 return llvm::getNumBytesForUTF8(FirstChar);
Daniel Jasper3ac9b9e2013-07-08 14:34:09 +0000102 default:
103 return 1;
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000104 }
105}
106
/// \brief Returns true if \p c is an ASCII octal digit ('0' through '7').
inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; }
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000108
/// \brief Returns true if \p c is an ASCII hexadecimal digit: '0'-'9',
/// 'a'-'f' or 'A'-'F'.
inline bool isHexDigit(char c) {
  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
         ('A' <= c && c <= 'F');
}
113
114/// \brief Gets the length of an escape sequence inside a C++ string literal.
115/// Text should span from the beginning of the escape sequence (starting with a
116/// backslash) to the end of the string literal.
117inline unsigned getEscapeSequenceLength(StringRef Text) {
118 assert(Text[0] == '\\');
119 if (Text.size() < 2)
120 return 1;
121
122 switch (Text[1]) {
123 case 'u':
124 return 6;
125 case 'U':
126 return 10;
127 case 'x': {
128 unsigned I = 2; // Point after '\x'.
129 while (I < Text.size() && isHexDigit(Text[I]))
130 ++I;
131 return I;
132 }
133 default:
134 if (isOctDigit(Text[1])) {
135 unsigned I = 1;
136 while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
137 ++I;
138 return I;
139 }
Justin Lebar90910552016-09-30 00:38:45 +0000140 return 1 + llvm::getNumBytesForUTF8(Text[1]);
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000141 }
142}
143
144} // namespace encoding
145} // namespace format
146} // namespace clang
147
Benjamin Kramer2f5db8b2014-08-13 16:25:19 +0000148#endif