//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
///
//===----------------------------------------------------------------------===//
15
Benjamin Kramer2f5db8b2014-08-13 16:25:19 +000016#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
17#define LLVM_CLANG_LIB_FORMAT_ENCODING_H
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000018
19#include "clang/Basic/LLVM.h"
Mehdi Amini7322b8b2016-04-18 09:08:59 +000020#include "llvm/ADT/StringRef.h"
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000021#include "llvm/Support/ConvertUTF.h"
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000022#include "llvm/Support/Unicode.h"
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000023
24namespace clang {
25namespace format {
26namespace encoding {
27
/// \brief Encodings distinguished by the functions in this file.
enum Encoding {
  Encoding_UTF8,   // Text is handled as UTF-8.
  Encoding_Unknown // We treat all other encodings as 8-bit encodings.
};
32
33/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
34/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
35inline Encoding detectEncoding(StringRef Text) {
36 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
37 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
38 if (::isLegalUTF8String(&Ptr, BufEnd))
39 return Encoding_UTF8;
40 return Encoding_Unknown;
41}
42
43inline unsigned getCodePointCountUTF8(StringRef Text) {
44 unsigned CodePoints = 0;
45 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
46 ++CodePoints;
47 }
48 return CodePoints;
49}
50
51/// \brief Gets the number of code points in the Text using the specified
52/// Encoding.
53inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
54 switch (Encoding) {
Daniel Jasper3ac9b9e2013-07-08 14:34:09 +000055 case Encoding_UTF8:
56 return getCodePointCountUTF8(Text);
57 default:
58 return Text.size();
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000059 }
60}
61
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000062/// \brief Returns the number of columns required to display the \p Text on a
63/// generic Unicode-capable terminal. Text is assumed to use the specified
64/// \p Encoding.
65inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
66 if (Encoding == Encoding_UTF8) {
67 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
Alexander Kornienko71d95d62013-11-26 10:38:53 +000068 // FIXME: Figure out the correct way to handle this in the presence of both
69 // printable and unprintable multi-byte UTF-8 characters. Falling back to
70 // returning the number of bytes may cause problems, as columnWidth suddenly
71 // becomes non-additive.
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000072 if (ContentWidth >= 0)
73 return ContentWidth;
74 }
75 return Text.size();
76}
77
78/// \brief Returns the number of columns required to display the \p Text,
79/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
80/// text is assumed to use the specified \p Encoding.
81inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
82 unsigned TabWidth, Encoding Encoding) {
83 unsigned TotalWidth = 0;
84 StringRef Tail = Text;
85 for (;;) {
86 StringRef::size_type TabPos = Tail.find('\t');
87 if (TabPos == StringRef::npos)
88 return TotalWidth + columnWidth(Tail, Encoding);
Alexander Kornienko71d95d62013-11-26 10:38:53 +000089 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
Alexander Kornienkoebb43ca2013-09-05 14:08:34 +000090 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
91 Tail = Tail.substr(TabPos + 1);
92 }
93}
94
Alexander Kornienkoffcc0102013-06-05 14:09:10 +000095/// \brief Gets the number of bytes in a sequence representing a single
96/// codepoint and starting with FirstChar in the specified Encoding.
97inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
98 switch (Encoding) {
Daniel Jasper3ac9b9e2013-07-08 14:34:09 +000099 case Encoding_UTF8:
100 return getNumBytesForUTF8(FirstChar);
101 default:
102 return 1;
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000103 }
104}
105
/// \brief Returns true if \p c is an octal digit ('0' through '7').
inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; }
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000107
/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
         ('A' <= c && c <= 'F');
}
112
113/// \brief Gets the length of an escape sequence inside a C++ string literal.
114/// Text should span from the beginning of the escape sequence (starting with a
115/// backslash) to the end of the string literal.
116inline unsigned getEscapeSequenceLength(StringRef Text) {
117 assert(Text[0] == '\\');
118 if (Text.size() < 2)
119 return 1;
120
121 switch (Text[1]) {
122 case 'u':
123 return 6;
124 case 'U':
125 return 10;
126 case 'x': {
127 unsigned I = 2; // Point after '\x'.
128 while (I < Text.size() && isHexDigit(Text[I]))
129 ++I;
130 return I;
131 }
132 default:
133 if (isOctDigit(Text[1])) {
134 unsigned I = 1;
135 while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
136 ++I;
137 return I;
138 }
Daniel Jaspere35c2202015-07-20 23:28:07 +0000139 return 1 + getNumBytesForUTF8(Text[1]);
Alexander Kornienkoffcc0102013-06-05 14:09:10 +0000140 }
141}
142
143} // namespace encoding
144} // namespace format
145} // namespace clang
146
Benjamin Kramer2f5db8b2014-08-13 16:25:19 +0000147#endif