//===--- Encoding.h - Format C++ code -------------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
12/// 8-bit encodings and escape sequences in C++ string literals.
13///
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CLANG_FORMAT_ENCODING_H
17#define LLVM_CLANG_FORMAT_ENCODING_H
18
19#include "clang/Basic/LLVM.h"
20#include "llvm/Support/ConvertUTF.h"
21
22namespace clang {
23namespace format {
24namespace encoding {
25
/// \brief Text encodings distinguished by the functions in this file.
enum Encoding {
  /// The text can be decoded as UTF-8.
  Encoding_UTF8 = 0,
  /// Any other encoding; all of them are treated as 8-bit encodings.
  Encoding_Unknown = 1
};
30
31/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
32/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
33inline Encoding detectEncoding(StringRef Text) {
34 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
35 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
36 if (::isLegalUTF8String(&Ptr, BufEnd))
37 return Encoding_UTF8;
38 return Encoding_Unknown;
39}
40
41inline unsigned getCodePointCountUTF8(StringRef Text) {
42 unsigned CodePoints = 0;
43 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
44 ++CodePoints;
45 }
46 return CodePoints;
47}
48
49/// \brief Gets the number of code points in the Text using the specified
50/// Encoding.
51inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
52 switch (Encoding) {
53 case Encoding_UTF8:
54 return getCodePointCountUTF8(Text);
55 default:
56 return Text.size();
57 }
58}
59
60/// \brief Gets the number of bytes in a sequence representing a single
61/// codepoint and starting with FirstChar in the specified Encoding.
62inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
63 switch (Encoding) {
64 case Encoding_UTF8:
65 return getNumBytesForUTF8(FirstChar);
66 default:
67 return 1;
68 }
69}
70
/// \brief Returns true if \p c is an octal digit, i.e. one of '0' to '7'.
inline bool isOctDigit(char c) {
  switch (c) {
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
    return true;
  default:
    return false;
  }
}
74
/// \brief Returns true if \p c is a hexadecimal digit: '0' to '9', 'a' to 'f'
/// or 'A' to 'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
79
80/// \brief Gets the length of an escape sequence inside a C++ string literal.
81/// Text should span from the beginning of the escape sequence (starting with a
82/// backslash) to the end of the string literal.
83inline unsigned getEscapeSequenceLength(StringRef Text) {
84 assert(Text[0] == '\\');
85 if (Text.size() < 2)
86 return 1;
87
88 switch (Text[1]) {
89 case 'u':
90 return 6;
91 case 'U':
92 return 10;
93 case 'x': {
94 unsigned I = 2; // Point after '\x'.
95 while (I < Text.size() && isHexDigit(Text[I]))
96 ++I;
97 return I;
98 }
99 default:
100 if (isOctDigit(Text[1])) {
101 unsigned I = 1;
102 while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
103 ++I;
104 return I;
105 }
106 return 2;
107 }
108}
109
110} // namespace encoding
111} // namespace format
112} // namespace clang
113
114#endif // LLVM_CLANG_FORMAT_ENCODING_H