Add support for C++0x unicode string and character literals, from Craig Topper!


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@136210 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index f8a2a55..8249340 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -28,12 +28,31 @@
   return -1;
 }
 
+static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
+  switch (kind) {
+  default: assert(0 && "Unknown token type!");
+  case tok::char_constant:
+  case tok::string_literal:
+  case tok::utf8_string_literal:
+    return Target.getCharWidth();
+  case tok::wide_char_constant:
+  case tok::wide_string_literal:
+    return Target.getWCharWidth();
+  case tok::utf16_char_constant:
+  case tok::utf16_string_literal:
+    return Target.getChar16Width();
+  case tok::utf32_char_constant:
+  case tok::utf32_string_literal:
+    return Target.getChar32Width();
+  }
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
-                                  FullSourceLoc Loc, bool IsWide,
-                                  Diagnostic *Diags, const TargetInfo &Target) {
+                                  FullSourceLoc Loc, unsigned CharWidth,
+                                  Diagnostic *Diags) {
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -98,9 +117,6 @@
     }
 
     // See if any bits will be truncated when evaluated as a character.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       Overflow = true;
       ResultChar &= ~0U >> (32-CharWidth);
@@ -128,9 +144,6 @@
              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 
     // Check for overflow.  Reject '\777', but not L'\777'.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
         Diags->Report(Loc, diag::warn_octal_escape_too_large);
@@ -219,8 +232,8 @@
 /// we will likely rework our support for UCN's.
 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                             char *&ResultBuf, bool &HadError,
-                            FullSourceLoc Loc, bool wide, Diagnostic *Diags, 
-                            const LangOptions &Features) {
+                            FullSourceLoc Loc, unsigned CharByteWidth,
+                            Diagnostic *Diags, const LangOptions &Features) {
   typedef uint32_t UTF32;
   UTF32 UcnVal = 0;
   unsigned short UcnLen = 0;
@@ -230,19 +243,22 @@
     return;
   }
 
-  if (wide) {
-    (void)UcnLen;
-    assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
+         "only character widths of 1, 2, or 4 bytes supported");
 
-    if (!Features.ShortWChar) {
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultBuf++ = (UcnVal & 0x000000FF);
-      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
-      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
-      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
-      return;
-    }
+  (void)UcnLen;
+  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 
+  if (CharByteWidth == 4) {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    *ResultBuf++ = (UcnVal & 0x000000FF);
+    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+    return;
+  }
+
+  if (CharByteWidth == 2) {
     // Convert to UTF16.
     if (UcnVal < (UTF32)0xFFFF) {
       *ResultBuf++ = (UcnVal & 0x000000FF);
@@ -261,6 +277,9 @@
     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
     return;
   }
+
+  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
+
   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
   // The conversion below was inspired by:
   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
@@ -695,13 +714,18 @@
 
 
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
-                                     SourceLocation Loc, Preprocessor &PP) {
+                                     SourceLocation Loc, Preprocessor &PP,
+                                     tok::TokenKind kind) {
   // At this point we know that the character matches the regex "L?'.*'".
   HadError = false;
 
-  // Determine if this is a wide character.
-  IsWide = begin[0] == 'L';
-  if (IsWide) ++begin;
+  Kind = kind;
+
+  // Determine if this is a wide or UTF character.
+  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
+      Kind == tok::utf32_char_constant) {
+    ++begin;
+  }
 
   // Skip over the entry quote.
   assert(begin[0] == '\'' && "Invalid token lexed");
@@ -742,17 +766,17 @@
         ResultChar = utf32;
       } else {
         // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
         ResultChar = ProcessCharEscape(begin, end, HadError,
                                        FullSourceLoc(Loc,PP.getSourceManager()),
-                                       IsWide,
-                                       &PP.getDiagnostics(), PP.getTargetInfo());
+                                       CharWidth, &PP.getDiagnostics());
       }
     }
 
     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
     // implementation defined (C99 6.4.4.4p10).
     if (NumCharsSoFar) {
-      if (IsWide) {
+      if (!isAscii()) {
         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
         LitVal = 0;
       } else {
@@ -774,8 +798,8 @@
   if (NumCharsSoFar > 1) {
     // Warn about discarding the top bits for multi-char wide-character
     // constants (L'abcd').
-    if (IsWide)
-      PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+    if (!isAscii())
+      PP.Diag(Loc, diag::warn_extraneous_char_constant);
     else if (NumCharsSoFar != 4)
       PP.Diag(Loc, diag::ext_multichar_character_literal);
     else
@@ -787,14 +811,15 @@
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
 
-  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+  if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) &&
+      Value > 0xFFFF)
     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 
   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
   // character constants are not sign extended in the this implementation:
   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
-  if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
+  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
       PP.getLangOptions().CharIsSigned)
     Value = (signed char)Value;
 }
@@ -839,8 +864,8 @@
                     Preprocessor &PP, bool Complain)
   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
-    MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
-    ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
   init(StringToks, NumStringToks);
 }
 
@@ -860,7 +885,7 @@
   MaxTokenLength = StringToks[0].getLength();
   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  AnyWide = StringToks[0].is(tok::wide_string_literal);
+  Kind = StringToks[0].getKind();
 
   hadError = false;
 
@@ -881,8 +906,18 @@
     if (StringToks[i].getLength() > MaxTokenLength)
       MaxTokenLength = StringToks[i].getLength();
 
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].is(tok::wide_string_literal);
+    // Remember if we see any wide or utf-8/16/32 strings.
+    // Also check for illegal concatenations.
+    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
+      if (isAscii()) {
+        Kind = StringToks[i].getKind();
+      } else {
+        if (Diags)
+          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+                        diag::err_unsupported_string_concat);
+        hadError = true;
+      }
+    }
   }
 
   // Include space for the null terminator.
@@ -890,19 +925,14 @@
 
   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  wchar_tByteWidth = ~0U;
-  if (AnyWide) {
-    wchar_tByteWidth = Target.getWCharWidth();
-    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
-    wchar_tByteWidth /= 8;
-  }
+  // Get the width in bytes of char/wchar_t/char16_t/char32_t
+  CharByteWidth = getCharWidth(Kind, Target);
+  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
+  CharByteWidth /= 8;
 
   // The output buffer size needs to be large enough to hold wide characters.
   // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
+  SizeBound *= CharByteWidth;
 
   // Size the temporary buffer to hold the result string data.
   ResultBuf.resize(SizeBound);
@@ -927,18 +957,19 @@
       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                          &StringInvalid);
     if (StringInvalid) {
-      hadError = 1;
+      hadError = true;
       continue;
     }
 
     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    bool wide = false;
     // TODO: Input character set mapping support.
 
     // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') {
-      wide = true;
+    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
       ++ThisTokBuf;
+      // Skip 8 of u8 marker for utf8 strings.
+      if (ThisTokBuf[0] == '8')
+        ++ThisTokBuf;
     }
 
     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
@@ -967,7 +998,7 @@
 
         // Copy the character span over.
         unsigned Len = ThisTokBuf-InStart;
-        if (!AnyWide) {
+        if (CharByteWidth == 1) {
           memcpy(ResultPtr, InStart, Len);
           ResultPtr += Len;
         } else {
@@ -975,7 +1006,7 @@
           for (; Len; --Len, ++InStart) {
             *ResultPtr++ = InStart[0];
             // Add zeros at the end.
-            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
               *ResultPtr++ = 0;
           }
         }
@@ -985,29 +1016,26 @@
       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
         EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
                         hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
-                        wide, Diags, Features);
+                        CharByteWidth, Diags, Features);
         continue;
       }
       // Otherwise, this is a non-UCN escape character.  Process it.
       unsigned ResultChar =
         ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
                           FullSourceLoc(StringToks[i].getLocation(), SM),
-                          AnyWide, Diags, Target);
+                          CharByteWidth*8, Diags);
 
       // Note: our internal rep of wide char tokens is always little-endian.
       *ResultPtr++ = ResultChar & 0xFF;
 
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
-      }
+      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+        *ResultPtr++ = ResultChar >> i*8;
     }
   }
 
   if (Pascal) {
     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
-    if (AnyWide)
-      ResultBuf[0] /= wchar_tByteWidth;
+    ResultBuf[0] /= CharByteWidth;
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
@@ -1016,7 +1044,7 @@
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
-      hadError = 1;
+      hadError = true;
       return;
     }
   } else if (Diags) {
@@ -1050,7 +1078,8 @@
   if (StringInvalid)
     return 0;
 
-  assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
+  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
+         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
 
 
   const char *SpellingStart = SpellingPtr;
@@ -1075,7 +1104,7 @@
     bool HadError = false;
     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
                       FullSourceLoc(Tok.getLocation(), SM),
-                      false, Diags, Target);
+                      CharByteWidth*8, Diags);
     assert(!HadError && "This method isn't valid on erroneous strings");
     --ByteNo;
   }