Implement UCN support for C string literals (C99 6.4.3) and add some very basic tests. Chris Goller has graciously offered to write some test to help validate UCN support.
From a front-end perspective, I believe this code should work for ObjC @-strings. At the moment, I believe we need to tweak the code generation for @-strings (which doesn't appear to handle them). Will be investigating.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@68076 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index c20383f..dcd239d 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -71,8 +71,6 @@
case 'v':
ResultChar = 11;
break;
-
- //case 'u': case 'U': // FIXME: UCNs.
case 'x': { // Hex escape.
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
@@ -151,7 +149,90 @@
return ResultChar;
}
+/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
+/// When we decide to implement UCN's for character constants and identifiers,
+/// we will likely rework our support for UCN's.
+static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+ char *&ResultBuf, const char *ResultBufEnd,
+ bool &HadError,
+ SourceLocation Loc, Preprocessor &PP) {
+ // FIXME: Add a warning - UCN's are only valid in C++ & C99.
+
+ // Skip the '\u' char's.
+ ThisTokBuf += 2;
+ if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
+ PP.Diag(Loc, diag::err_ucn_escape_no_digits);
+ HadError = 1;
+ return;
+ }
+ typedef unsigned int UTF32;
+
+ UTF32 UcnVal = 0;
+ unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+ for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+ int CharVal = HexDigitValue(ThisTokBuf[0]);
+ if (CharVal == -1) break;
+ UcnVal <<= 4;
+ UcnVal |= CharVal;
+ }
+ // If we didn't consume the proper number of digits, there is a problem.
+ if (UcnLen) {
+ PP.Diag(Loc, diag::err_ucn_escape_incomplete);
+ HadError = 1;
+ return;
+ }
+ // Check UCN constraints (C99 6.4.3p2)
+ if ((UcnVal < 0xa0 &&
+ (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
+ || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)) {
+ PP.Diag(Loc, diag::err_ucn_escape_invalid);
+ HadError = 1;
+ return;
+ }
+ // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
+ // The conversion below was inspired by:
+ // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
+ // First, we determine how many bytes the result will require.
+ typedef unsigned char UTF8;
+
+ unsigned short bytesToWrite = 0;
+ if (UcnVal < (UTF32)0x80)
+ bytesToWrite = 1;
+ else if (UcnVal < (UTF32)0x800)
+ bytesToWrite = 2;
+ else if (UcnVal < (UTF32)0x10000)
+ bytesToWrite = 3;
+ else
+ bytesToWrite = 4;
+
+ // If the buffer isn't big enough, bail.
+ if ((ResultBuf + bytesToWrite) >= ResultBufEnd) {
+ PP.Diag(Loc, diag::err_ucn_escape_too_big);
+ HadError = 1;
+ return;
+ }
+ const unsigned byteMask = 0xBF;
+ const unsigned byteMark = 0x80;
+
+ // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
+ // into the first byte, depending on how many bytes follow. There are
+ // as many entries in this table as there are UTF8 sequence types.
+ static const UTF8 firstByteMark[7] = {
+ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+ };
+ // Finally, we write the bytes into ResultBuf.
+ ResultBuf += bytesToWrite;
+ switch (bytesToWrite) { // note: everything falls through.
+ case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+ case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+ case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
+ case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
+ }
+ // Update the buffer.
+ ResultBuf += bytesToWrite;
+}
/// integer-constant: [C99 6.4.4.1]
@@ -757,23 +838,29 @@
*ResultPtr++ = InStart[0];
// Add zeros at the end.
for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
- *ResultPtr++ = 0;
+ *ResultPtr++ = 0;
}
}
continue;
}
- // Otherwise, this is an escape character. Process it.
- unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
- StringToks[i].getLocation(),
- ThisIsWide, PP);
-
- // Note: our internal rep of wide char tokens is always little-endian.
- *ResultPtr++ = ResultChar & 0xFF;
-
- if (AnyWide) {
- for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
- *ResultPtr++ = ResultChar >> i*8;
+ if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
+ ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+ GetString() + ResultBuf.size(),
+ hadError, StringToks[i].getLocation(), PP);
+ } else {
+ // Otherwise, this is a non-UCN escape character. Process it.
+ unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+ StringToks[i].getLocation(),
+ ThisIsWide, PP);
+
+ // Note: our internal rep of wide char tokens is always little-endian.
+ *ResultPtr++ = ResultChar & 0xFF;
+
+ if (AnyWide) {
+ for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ *ResultPtr++ = ResultChar >> i*8;
+ }
}
}
}