Handle universal character names and Unicode characters outside of literals.
This is a missing piece for C99 conformance.
This patch handles UCNs by adding a '\\' case to LexTokenInternal and
LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
If the UCN is not syntactically well-formed, we fall back to the old
treatment: a backslash followed by an identifier beginning with 'u' (or 'U').
Because the spelling of an identifier with UCNs still has the UCN in it, we
need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.
Of course, valid code that does *not* use UCNs will see only a very minimal
performance hit (checks after each identifier for non-ASCII characters,
checks when converting raw_identifiers to identifiers that they do not
contain UCNs, and checks when getting the spelling of an identifier that it
does not contain a UCN).
This patch also adds basic support for actual UTF-8 in the source. This is
treated almost exactly the same as UCNs except that we consider stray
Unicode characters to be mistakes and offer a fixit to remove them.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@173369 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CXX/over/over.oper/over.literal/p8.cpp b/test/CXX/over/over.oper/over.literal/p8.cpp
index 6f63610..70a1843 100644
--- a/test/CXX/over/over.oper/over.literal/p8.cpp
+++ b/test/CXX/over/over.oper/over.literal/p8.cpp
@@ -7,8 +7,7 @@
void operator "" _km(long double); // ok
string operator "" _i18n(const char*, std::size_t); // ok
-// FIXME: This should be accepted once we support UCNs
-template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}}
+template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}}
float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}}
float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
diff --git a/test/CodeGen/ucn-identifiers.c b/test/CodeGen/ucn-identifiers.c
new file mode 100644
index 0000000..56e3aa5
--- /dev/null
+++ b/test/CodeGen/ucn-identifiers.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null
+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null -x c++
+// This file contains UTF-8; please do not fix!
+
+
+extern void \u00FCber(int);
+extern void \U000000FCber(int); // redeclaration, no warning
+
+void goodCalls() {
+ \u00FCber(0);
+ \u00fcber(1);
+ über(2);
+ \U000000FCber(3);
+}
diff --git a/test/FixIt/fixit-unicode.c b/test/FixIt/fixit-unicode.c
index 2af5e08..c45ba06 100644
--- a/test/FixIt/fixit-unicode.c
+++ b/test/FixIt/fixit-unicode.c
@@ -8,13 +8,15 @@
// PR13312
void test1() {
struct Foo foo;
- (&foo)☃>bar = 42;
+ foo.bar = 42☃
+// CHECK: error: non-ASCII characters are not allowed outside of literals and identifiers
+// CHECK: {{^ \^}}
// CHECK: error: expected ';' after expression
// Make sure we emit the fixit right in front of the snowman.
-// CHECK: {{^ \^}}
-// CHECK: {{^ ;}}
+// CHECK: {{^ \^}}
+// CHECK: {{^ ;}}
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{11:9-11:9}:";"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-8]]:15-[[@LINE-8]]:15}:";"
}
@@ -29,5 +31,5 @@
// because different systems will render the delta differently (either as a
// character, or as <U+2206>.) The fixit should line up with the %d regardless.
-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{23:16-23:18}:"%ld"
+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-9]]:16-[[@LINE-9]]:18}:"%ld"
}
diff --git a/test/Lexer/utf8-invalid.c b/test/Lexer/utf8-invalid.c
new file mode 100644
index 0000000..c4dd318
--- /dev/null
+++ b/test/Lexer/utf8-invalid.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// Note: this file contains invalid UTF-8 before the variable name in the
+// next line. Please do not fix!
+
+extern int x; // expected-error{{source file is not valid UTF-8}}
diff --git a/test/Preprocessor/ucn-pp-identifier.c b/test/Preprocessor/ucn-pp-identifier.c
new file mode 100644
index 0000000..f4afa91
--- /dev/null
+++ b/test/Preprocessor/ucn-pp-identifier.c
@@ -0,0 +1,97 @@
+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
+
+#define \u00FC
+#define a\u00FD() 0
+#ifndef \u00FC
+#error "This should never happen"
+#endif
+
+#if a\u00FD()
+#error "This should never happen"
+#endif
+
+#if a\U000000FD()
+#error "This should never happen"
+#endif
+
+#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+#if \U0001000 // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
+#endif
+
+// Make sure we reject disallowed UCNs
+#define \ufffe // expected-error {{macro names must be identifiers}}
+#define \U10000000 // expected-error {{macro names must be identifiers}}
+#define \u0061 // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro names must be identifiers}}
+
+// FIXME: Not clear what our behavior should be here; \u0024 is "$".
+#define a\u0024 // expected-warning {{whitespace}}
+
+#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
+#endif
+
+
+#define \u0110 1 / 0
+#if \u0110 // expected-error {{division by zero in preprocessor expression}}
+#endif
+
+#define STRINGIZE(X) # X
+
+extern int check_size[sizeof(STRINGIZE(\u0112)) == 3 ? 1 : -1];
+
+// Check that we still diagnose disallowed UCNs in #if 0 blocks.
+// C99 5.1.1.2p1 and C++11 [lex.phases]p1 dictate that preprocessor tokens are
+// formed before directives are parsed.
+// expected-error@+4 {{character 'a' cannot be specified by a universal character name}}
+#if 0
+#define \ufffe // okay
+#define \U10000000 // okay
+#define \u0061 // error, but -verify only looks at comments outside #if 0
+#endif
+
+
+// A UCN formed by token pasting is undefined in both C99 and C++.
+// Right now we don't do anything special, which causes us to coincidentally
+// accept the first case below but reject the second two.
+#define PASTE(A, B) A ## B
+extern int PASTE(\, u00FD);
+extern int PASTE(\u, 00FD); // expected-warning{{\u used with no following hex digits}}
+extern int PASTE(\u0, 0FD); // expected-warning{{incomplete universal character name}}
+#ifdef __cplusplus
+// expected-error@-3 {{expected unqualified-id}}
+// expected-error@-3 {{expected unqualified-id}}
+#else
+// expected-error@-6 {{expected identifier}}
+// expected-error@-6 {{expected identifier}}
+#endif
+
+
+// A UCN produced by line splicing is valid in C99 but undefined in C++.
+// Since undefined behavior can do anything including working as intended,
+// we just accept it in C++ as well.;
+#define newline_1_\u00F\
+C 1
+#define newline_2_\u00\
+F\
+C 1
+#define newline_3_\u\
+00\
+FC 1
+#define newline_4_\\
+u00FC 1
+#define newline_5_\\
+u\
+\
+0\
+0\
+F\
+C 1
+
+#if (newline_1_\u00FC && newline_2_\u00FC && newline_3_\u00FC && \
+ newline_4_\u00FC && newline_5_\u00FC)
+#else
+#error "Line splicing failed to produce UCNs"
+#endif
diff --git a/test/Sema/ucn-identifiers.c b/test/Sema/ucn-identifiers.c
new file mode 100644
index 0000000..6b26365
--- /dev/null
+++ b/test/Sema/ucn-identifiers.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -pedantic
+// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ -pedantic
+
+// This file contains UTF-8; please do not fix!
+
+
+extern void \u00FCber(int);
+extern void \U000000FCber(int); // redeclaration, no warning
+#ifdef __cplusplus
+// expected-note@-2 + {{candidate function not viable}}
+#else
+// expected-note@-4 + {{declared here}}
+#endif
+
+void goodCalls() {
+ \u00FCber(0);
+ \u00fcber(1);
+ über(2);
+ \U000000FCber(3);
+}
+
+void badCalls() {
+ \u00FCber(0.5); // expected-warning{{implicit conversion from 'double' to 'int'}}
+ \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}}
+
+ über(1, 2);
+ \U000000FCber();
+#ifdef __cplusplus
+ // expected-error@-3 {{no matching function}}
+ // expected-error@-3 {{no matching function}}
+#else
+ // expected-error@-6 {{too many arguments to function call, expected 1, have 2}}
+ // expected-error@-6 {{too few arguments to function call, expected 1, have 0}}
+#endif
+}