Handle universal character names and Unicode characters outside of literals.
This is a missing piece for C99 conformance.
This patch handles UCNs by adding a '\\' case to LexTokenInternal and
LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
If the UCN is not syntactically well-formed, we fall back to the old
treatment: a backslash followed by an identifier beginning with 'u' (or 'U').
Because the spelling of an identifier with UCNs still has the UCN in it, we
need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.
Of course, valid code that does *not* use UCNs will see only a very minimal
performance hit (checks after each identifier for non-ASCII characters,
checks when converting raw_identifiers to identifiers that they do not
contain UCNs, and checks when getting the spelling of an identifier that it
does not contain a UCN).
This patch also adds basic support for actual UTF-8 in the source. This is
treated almost exactly the same as UCNs except that we consider stray
Unicode characters to be mistakes and offer a fixit to remove them.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173369 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang/Basic/ConvertUTF.h b/include/clang/Basic/ConvertUTF.h
index fb05afd..38956ee 100644
--- a/include/clang/Basic/ConvertUTF.h
+++ b/include/clang/Basic/ConvertUTF.h
@@ -161,6 +161,16 @@
unsigned getNumBytesForUTF8(UTF8 firstByte);
+static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
+ const UTF8 *sourceEnd,
+ UTF32 *target,
+ ConversionFlags flags) {
+ unsigned size = getNumBytesForUTF8(**source);
+ if (size > sourceEnd - *source)
+ return sourceExhausted;
+ return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
+}
+
#ifdef __cplusplus
}
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index c8b4423..00b385e 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -93,15 +93,29 @@
"multi-character character constant">, InGroup<MultiChar>;
def ext_four_char_character_literal : Extension<
"multi-character character constant">, InGroup<FourByteMultiChar>;
-
-// Literal
-def ext_nonstandard_escape : Extension<
- "use of non-standard escape character '\\%0'">;
-def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
-def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
+
+// Unicode and UCNs
+def err_invalid_utf8 : Error<
+ "source file is not valid UTF-8">;
+def err_non_ascii : Error<
+ "non-ASCII characters are not allowed outside of literals and identifiers">;
+def ext_unicode_whitespace : ExtWarn<
+ "treating Unicode character as whitespace">,
+ InGroup<DiagGroup<"unicode-whitespace">>;
+
+def err_hex_escape_no_digits : Error<
+ "\\%0 used with no following hex digits">;
+def warn_ucn_escape_no_digits : Warning<
+ "\\%0 used with no following hex digits; "
+ "treating as '\\' followed by identifier">, InGroup<Unicode>;
+def err_ucn_escape_incomplete : Error<
+ "incomplete universal character name">;
+def warn_ucn_escape_incomplete : Warning<
+ "incomplete universal character name; "
+ "treating as '\\' followed by identifier">, InGroup<Unicode>;
def err_ucn_escape_invalid : Error<"invalid universal character">;
-def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
+
def err_ucn_escape_basic_scs : Error<
"character '%0' cannot be specified by a universal character name">;
def err_ucn_control_character : Error<
@@ -112,6 +126,12 @@
def warn_cxx98_compat_literal_ucn_control_character : Warning<
"universal character name referring to a control character "
"is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
+
+
+// Literal
+def ext_nonstandard_escape : Extension<
+ "use of non-standard escape character '\\%0'">;
+def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h
index d36189f..535baf5 100644
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -437,6 +437,11 @@
///
void LexTokenInternal(Token &Result);
+ /// Given that a token begins with the Unicode character \p C, figure out
+ /// what kind of token it is and dispatch to the appropriate lexing helper
+ /// function.
+ void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
+
/// FormTokenWithChars - When we lex a token, we have identified a span
/// starting at BufferPtr, going to TokEnd that forms the token. This method
/// takes that range and assigns it to the token as its location and size. In
@@ -579,6 +584,21 @@
void cutOffLexing() { BufferPtr = BufferEnd; }
bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
+
+
+ /// Read a universal character name.
+ ///
+ /// \param CurPtr The position in the source buffer after the initial '\'.
+ /// If the UCN is syntactically well-formed (but not necessarily
+ /// valid), this parameter will be updated to point to the
+ /// character after the UCN.
+ /// \param SlashLoc The position in the source buffer of the '\'.
+ /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
+ /// and handle token formation in the caller.
+ ///
+ /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
+ /// invalid.
+ uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
};
diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h
index 06ff56e..bcbe9c9 100644
--- a/include/clang/Lex/Token.h
+++ b/include/clang/Lex/Token.h
@@ -74,9 +74,10 @@
StartOfLine = 0x01, // At start of line or only after whitespace.
LeadingSpace = 0x02, // Whitespace exists before this token.
DisableExpand = 0x04, // This identifier may never be macro expanded.
- NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
+ NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
- HasUDSuffix = 0x20 // This string or character literal has a ud-suffix.
+ HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
+ HasUCN = 0x40 // This identifier contains a UCN.
};
tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
@@ -257,6 +258,9 @@
/// \brief Return true if this token is a string or character literal which
/// has a ud-suffix.
bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
+
+ /// Returns true if this token contains a universal character name.
+ bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
};
/// \brief Information about the conditional stack (\#if directives)