Don't warn about Unicode characters in -E mode.
People use the C preprocessor for things other than C files, and some of
those files contain Unicode characters. We shouldn't warn about Unicode
characters appearing outside of identifiers when only preprocessing (-E).
There is currently no way for the preprocessor to tell whether it is in -E
mode, so I added a new flag, derived from the PreprocessorOutputOptions. The
flag is only consulted by the Unicode diagnostics for now, but could
conceivably gate other warnings or even behavioral differences later.
<rdar://problem/13107323>
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173881 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 3e3aaae..08f406b 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -2811,14 +2811,13 @@
}
void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
- if (isUnicodeWhitespace(C)) {
- if (!isLexingRawMode()) {
- CharSourceRange CharRange =
- CharSourceRange::getCharRange(getSourceLocation(),
- getSourceLocation(CurPtr));
- Diag(BufferPtr, diag::ext_unicode_whitespace)
- << CharRange;
- }
+ if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
+ isUnicodeWhitespace(C)) {
+ CharSourceRange CharRange =
+ CharSourceRange::getCharRange(getSourceLocation(),
+ getSourceLocation(CurPtr));
+ Diag(BufferPtr, diag::ext_unicode_whitespace)
+ << CharRange;
Result.setFlag(Token::LeadingSpace);
if (SkipWhitespace(Result, CurPtr))
@@ -2832,7 +2831,8 @@
return LexIdentifier(Result, CurPtr);
}
- if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
+ if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
+ !isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
// Non-ASCII characters tend to creep into source code unintentionally.
// Instead of letting the parser complain about the unknown token,
// just drop the character.
@@ -2842,13 +2842,11 @@
// loophole in the mapping of Unicode characters to basic character set
// characters that allows us to map these particular characters to, say,
// whitespace.
- if (!isLexingRawMode()) {
- CharSourceRange CharRange =
- CharSourceRange::getCharRange(getSourceLocation(),
- getSourceLocation(CurPtr));
- Diag(BufferPtr, diag::err_non_ascii)
- << FixItHint::CreateRemoval(CharRange);
- }
+ CharSourceRange CharRange =
+ CharSourceRange::getCharRange(getSourceLocation(),
+ getSourceLocation(CurPtr));
+ Diag(BufferPtr, diag::err_non_ascii)
+ << FixItHint::CreateRemoval(CharRange);
BufferPtr = CurPtr;
return LexTokenInternal(Result);
@@ -3537,11 +3535,15 @@
if (Status == conversionOK)
return LexUnicode(Result, CodePoint, CurPtr);
+ if (isLexingRawMode() || PP->isPreprocessedOutput()) {
+ Kind = tok::unknown;
+ break;
+ }
+
// Non-ASCII characters tend to creep into source code unintentionally.
// Instead of letting the parser complain about the unknown token,
// just diagnose the invalid UTF-8, then drop the character.
- if (!isLexingRawMode())
- Diag(CurPtr, diag::err_invalid_utf8);
+ Diag(CurPtr, diag::err_invalid_utf8);
BufferPtr = CurPtr+1;
goto LexNextToken;