Add support for UCNs for character literals

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@116129 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 9b7c46f..3b739b3 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -164,13 +164,10 @@
 }
 
 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
-/// When we decide to implement UCN's for character constants and identifiers,
-/// we will likely rework our support for UCN's.
-static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
-                             char *&ResultBuf, bool &HadError,
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             uint32_t &UcnVal, unsigned short &UcnLen,
                              SourceLocation Loc, Preprocessor &PP,
-                             bool wide,
                              bool Complain) {
   if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99)
     PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89);
@@ -184,27 +181,22 @@
   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
     if (Complain)
       PP.Diag(Loc, diag::err_ucn_escape_no_digits);
-    HadError = 1;
-    return;
+    return false;
   }
-  typedef uint32_t UTF32;
-
-  UTF32 UcnVal = 0;
-  unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
   unsigned short UcnLenSave = UcnLen;
-  for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
     int CharVal = HexDigitValue(ThisTokBuf[0]);
     if (CharVal == -1) break;
     UcnVal <<= 4;
     UcnVal |= CharVal;
   }
   // If we didn't consume the proper number of digits, there is a problem.
-  if (UcnLen) {
+  if (UcnLenSave) {
     if (Complain)
       PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
               diag::err_ucn_escape_incomplete);
-    HadError = 1;
-    return;
+    return false;
   }
   // Check UCN constraints (C99 6.4.3p2).
   if ((UcnVal < 0xa0 &&
@@ -213,13 +205,33 @@
       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
     if (Complain)
       PP.Diag(Loc, diag::err_ucn_escape_invalid);
+    return false;
+  }
+  return true;
+}
+
+/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
+/// StringLiteralParser. When we decide to implement UCN's for identifiers,
+/// we will likely rework our support for UCN's.
+static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+                             char *&ResultBuf, bool &HadError,
+                             SourceLocation Loc, Preprocessor &PP,
+                             bool wide,
+                             bool Complain) {
+  typedef uint32_t UTF32;
+  UTF32 UcnVal = 0;
+  unsigned short UcnLen = 0;
+  if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd,
+                        UcnVal, UcnLen, Loc, PP, Complain)) {
     HadError = 1;
     return;
   }
+
   if (wide) {
-    (void)UcnLenSave;
-    assert((UcnLenSave == 4 || UcnLenSave == 8) && 
-           "ProcessUCNEscape - only ucn length of 4 or 8 supported");
+    (void)UcnLen;
+    assert((UcnLen== 4 || UcnLen== 8) && 
+           "EncodeUCNEscape - only ucn length of 4 or 8 supported");
 
     if (!PP.getLangOptions().ShortWChar) {
       // Note: our internal rep of wide char tokens is always little-endian.
@@ -702,11 +714,26 @@
   bool Warned = false;
   while (begin[0] != '\'') {
     uint64_t ResultChar;
+
+      // Is this a Universal Character Name escape?
     if (begin[0] != '\\')     // If this is a normal character, consume it.
       ResultChar = *begin++;
-    else                      // Otherwise, this is an escape character.
-      ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
-                                     /*Complain=*/true);
+    else {                    // Otherwise, this is an escape character.
+      // Check for UCN.
+      if (begin[1] == 'u' || begin[1] == 'U') {
+        uint32_t utf32 = 0;
+        unsigned short UcnLen = 0;
+        if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
+                              Loc, PP, /*Complain=*/true)) {
+          HadError = 1;
+        }
+        ResultChar = utf32;
+      } else {
+        // Otherwise, this is a non-UCN escape character.  Process it.
+        ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
+                                       /*Complain=*/true);
+      }
+    }
 
     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
     // implementation defined (C99 6.4.4.4p10).
@@ -746,6 +773,9 @@
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
 
+  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+    PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+
   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
   // character constants are not sign extended in the this implementation:
@@ -915,9 +945,9 @@
       }
       // Is this a Universal Character Name escape?
       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
-        ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
-                         hadError, StringToks[i].getLocation(), PP, wide, 
-                         Complain);
+        EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+                        hadError, StringToks[i].getLocation(), PP, wide, 
+                        Complain);
         continue;
       }
       // Otherwise, this is a non-UCN escape character.  Process it.