Add support for C++0x unicode string and character literals, from Craig Topper! git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@136210 91177308-0d34-0410-b5e6-96231b3b80d8

commit: 5cee1195584fa8672253139c86e922daeda69b9e [log] [tgz]
author: Douglas Gregor <dgregor@apple.com> Wed Jul 27 05:40:30 2011 +0000
committer: Douglas Gregor <dgregor@apple.com> Wed Jul 27 05:40:30 2011 +0000
tree: e1b36e0f628359bb42d22d78c74e931057b962de
parent: 6fa8f86b8188c6d3c4d6616122a71ccd72a0c78a [diff]
diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp
index 2ea7991..d6e7d77 100644
--- a/lib/AST/ASTImporter.cpp
+++ b/lib/AST/ASTImporter.cpp

@@ -3814,8 +3814,8 @@
   if (T.isNull())
     return 0;
   
-  return new (Importer.getToContext()) CharacterLiteral(E->getValue(), 
-                                                        E->isWide(), T,
+  return new (Importer.getToContext()) CharacterLiteral(E->getValue(),
+                                                        E->getKind(), T,
                                           Importer.Import(E->getLocation()));
 }
 

diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp
index 58fb32d..5e795be 100644
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp

@@ -533,8 +533,7 @@
 }
 
 StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str,
-                                     bool Wide,
-                                     bool Pascal, QualType Ty,
+                                     StringKind Kind, bool Pascal, QualType Ty,
                                      const SourceLocation *Loc,
                                      unsigned NumStrs) {
   // Allocate enough space for the StringLiteral plus an array of locations for
@@ -549,7 +548,7 @@
   memcpy(AStrData, Str.data(), Str.size());
   SL->StrData = AStrData;
   SL->ByteLength = Str.size();
-  SL->IsWide = Wide;
+  SL->Kind = Kind;
   SL->IsPascal = Pascal;
   SL->TokLocs[0] = Loc[0];
   SL->NumConcatenated = NumStrs;
@@ -587,8 +586,8 @@
 SourceLocation StringLiteral::
 getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
                   const LangOptions &Features, const TargetInfo &Target) const {
-  assert(!isWide() && "This doesn't work for wide strings yet");
-  
+  assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");
+
   // Loop over all of the tokens in this string until we find the one that
   // contains the byte we're looking for.
   unsigned TokNo = 0;

diff --git a/lib/AST/StmtDumper.cpp b/lib/AST/StmtDumper.cpp
index 7218af5..ce4ae8e 100644
--- a/lib/AST/StmtDumper.cpp
+++ b/lib/AST/StmtDumper.cpp

@@ -443,8 +443,13 @@
   DumpExpr(Str);
   // FIXME: this doesn't print wstrings right.
   OS << " ";
-  if (Str->isWide())
-    OS << "L";
+  switch (Str->getKind()) {
+  case StringLiteral::Ascii: break; // No prefix
+  case StringLiteral::Wide:  OS << 'L'; break;
+  case StringLiteral::UTF8:  OS << "u8"; break;
+  case StringLiteral::UTF16: OS << 'u'; break;
+  case StringLiteral::UTF32: OS << 'U'; break;
+  }
   OS << '"';
   OS.write_escaped(Str->getString());
   OS << '"';

diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index 8fcad14..79f14bc 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp

@@ -599,8 +599,14 @@
 
 void StmtPrinter::VisitCharacterLiteral(CharacterLiteral *Node) {
   unsigned value = Node->getValue();
-  if (Node->isWide())
-    OS << "L";
+
+  switch (Node->getKind()) {
+  case CharacterLiteral::Ascii: break; // no prefix.
+  case CharacterLiteral::Wide:  OS << 'L'; break;
+  case CharacterLiteral::UTF16: OS << 'u'; break;
+  case CharacterLiteral::UTF32: OS << 'U'; break;
+  }
+
   switch (value) {
   case '\\':
     OS << "'\\\\'";
@@ -672,7 +678,13 @@
 }
 
 void StmtPrinter::VisitStringLiteral(StringLiteral *Str) {
-  if (Str->isWide()) OS << 'L';
+  switch (Str->getKind()) {
+  case StringLiteral::Ascii: break; // no prefix.
+  case StringLiteral::Wide:  OS << 'L'; break;
+  case StringLiteral::UTF8:  OS << "u8"; break;
+  case StringLiteral::UTF16: OS << 'u'; break;
+  case StringLiteral::UTF32: OS << 'U'; break;
+  }
   OS << '"';
 
   // FIXME: this doesn't print wstrings right.

diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp
index 120c9e5..12321ef 100644
--- a/lib/AST/StmtProfile.cpp
+++ b/lib/AST/StmtProfile.cpp

@@ -252,7 +252,7 @@
 
 void StmtProfiler::VisitCharacterLiteral(const CharacterLiteral *S) {
   VisitExpr(S);
-  ID.AddBoolean(S->isWide());
+  ID.AddInteger(S->getKind());
   ID.AddInteger(S->getValue());
 }
 
@@ -269,7 +269,7 @@
 void StmtProfiler::VisitStringLiteral(const StringLiteral *S) {
   VisitExpr(S);
   ID.AddString(S->getString());
-  ID.AddBoolean(S->isWide());
+  ID.AddInteger(S->getKind());
 }
 
 void StmtProfiler::VisitParenExpr(const ParenExpr *S) {

diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp
index 7cd3be2..2555ab3 100644
--- a/lib/AST/Type.cpp
+++ b/lib/AST/Type.cpp

@@ -635,6 +635,18 @@
   return false;
 }
 
+bool Type::isChar16Type() const {
+  if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
+    return BT->getKind() == BuiltinType::Char16;
+  return false;
+}
+
+bool Type::isChar32Type() const {
+  if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
+    return BT->getKind() == BuiltinType::Char32;
+  return false;
+}
+
 /// \brief Determine whether this type is any of the built-in character
 /// types.
 bool Type::isAnyCharacterType() const {

diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 290fe24..ce32325 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp

@@ -1877,8 +1877,20 @@
   // Resize the string to the right size.
   uint64_t RealLen = CAT->getSize().getZExtValue();
 
-  if (E->isWide())
+  switch (E->getKind()) {
+  case StringLiteral::Ascii:
+  case StringLiteral::UTF8:
+    break;
+  case StringLiteral::Wide:
     RealLen *= Context.Target.getWCharWidth() / Context.getCharWidth();
+    break;
+  case StringLiteral::UTF16:
+    RealLen *= Context.Target.getChar16Width() / Context.getCharWidth();
+    break;
+  case StringLiteral::UTF32:
+    RealLen *= Context.Target.getChar32Width() / Context.getCharWidth();
+    break;
+  }
 
   std::string Str = E->getString().str();
   Str.resize(RealLen, '\0');
@@ -1893,7 +1905,7 @@
   // FIXME: This can be more efficient.
   // FIXME: We shouldn't need to bitcast the constant in the wide string case.
   llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S));
-  if (S->isWide()) {
+  if (S->isWide() || S->isUTF16() || S->isUTF32()) {
     llvm::Type *DestTy =
         llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType()));
     C = llvm::ConstantExpr::getBitCast(C, DestTy);

diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 6c7169f..44674a9 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp

@@ -1267,8 +1267,9 @@
 }
 
 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
-/// either " or L".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
+/// either " or L" or u8" or u" or U".
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
+                             tok::TokenKind Kind) {
   const char *NulCharacter = 0; // Does this string contain the \0 character?
 
   char C = getAndAdvanceChar(CurPtr, Result);
@@ -1299,8 +1300,7 @@
 
   // Update the location of the token as well as the BufferPtr instance var.
   const char *TokStart = BufferPtr;
-  FormTokenWithChars(Result, CurPtr,
-                     Wide ? tok::wide_string_literal : tok::string_literal);
+  FormTokenWithChars(Result, CurPtr, Kind);
   Result.setLiteralData(TokStart);
 }
 
@@ -1339,8 +1339,9 @@
 
 
 /// LexCharConstant - Lex the remainder of a character constant, after having
-/// lexed either ' or L'.
-void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
+/// lexed either ' or L' or u' or U'.
+void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
+                            tok::TokenKind Kind) {
   const char *NulCharacter = 0; // Does this character contain the \0 character?
 
   char C = getAndAdvanceChar(CurPtr, Result);
@@ -1377,7 +1378,7 @@
 
   // Update the location of token as well as BufferPtr.
   const char *TokStart = BufferPtr;
-  FormTokenWithChars(Result, CurPtr, tok::char_constant);
+  FormTokenWithChars(Result, CurPtr, Kind);
   Result.setLiteralData(TokStart);
 }
 
@@ -2185,6 +2186,55 @@
     MIOpt.ReadToken();
     return LexNumericConstant(Result, CurPtr);
 
+  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      // UTF-16 string literal
+      if (Char == '"')
+        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                                tok::utf16_string_literal);
+
+      // UTF-16 character constant
+      if (Char == '\'')
+        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                               tok::utf16_char_constant);
+
+      // UTF-8 string literal
+      if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexStringLiteral(Result,
+                              ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                          SizeTmp2, Result),
+                              tok::utf8_string_literal);
+    }
+
+    // treat u like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
+  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      // UTF-32 string literal
+      if (Char == '"')
+        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                                tok::utf32_string_literal);
+
+      // UTF-32 character constant
+      if (Char == '\'')
+        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                               tok::utf32_char_constant);
+    }
+
+    // treat U like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
@@ -2193,21 +2243,22 @@
     // Wide string literal.
     if (Char == '"')
       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
-                              true);
+                              tok::wide_string_literal);
 
     // Wide character constant.
     if (Char == '\'')
-      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
+      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                             tok::wide_char_constant);
     // FALL THROUGH, treating L like the start of an identifier.
 
   // C99 6.4.2: Identifiers.
   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
-  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':    /*'U'*/
   case 'V': case 'W': case 'X': case 'Y': case 'Z':
   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
-  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
   case 'v': case 'w': case 'x': case 'y': case 'z':
   case '_':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -2230,13 +2281,13 @@
   case '\'':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
-    return LexCharConstant(Result, CurPtr);
+    return LexCharConstant(Result, CurPtr, tok::char_constant);
 
   // C99 6.4.5: String Literals.
   case '"':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
-    return LexStringLiteral(Result, CurPtr, false);
+    return LexStringLiteral(Result, CurPtr, tok::string_literal);
 
   // C99 6.4.6: Punctuators.
   case '?':

diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index f8a2a55..8249340 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp

@@ -28,12 +28,31 @@
   return -1;
 }
 
+static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
+  switch (kind) {
+  default: assert(0 && "Unknown token type!");
+  case tok::char_constant:
+  case tok::string_literal:
+  case tok::utf8_string_literal:
+    return Target.getCharWidth();
+  case tok::wide_char_constant:
+  case tok::wide_string_literal:
+    return Target.getWCharWidth();
+  case tok::utf16_char_constant:
+  case tok::utf16_string_literal:
+    return Target.getChar16Width();
+  case tok::utf32_char_constant:
+  case tok::utf32_string_literal:
+    return Target.getChar32Width();
+  }
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
-                                  FullSourceLoc Loc, bool IsWide,
-                                  Diagnostic *Diags, const TargetInfo &Target) {
+                                  FullSourceLoc Loc, unsigned CharWidth,
+                                  Diagnostic *Diags) {
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -98,9 +117,6 @@
     }
 
     // See if any bits will be truncated when evaluated as a character.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       Overflow = true;
       ResultChar &= ~0U >> (32-CharWidth);
@@ -128,9 +144,6 @@
              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 
     // Check for overflow.  Reject '\777', but not L'\777'.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
         Diags->Report(Loc, diag::warn_octal_escape_too_large);
@@ -219,8 +232,8 @@
 /// we will likely rework our support for UCN's.
 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                             char *&ResultBuf, bool &HadError,
-                            FullSourceLoc Loc, bool wide, Diagnostic *Diags, 
-                            const LangOptions &Features) {
+                            FullSourceLoc Loc, unsigned CharByteWidth,
+                            Diagnostic *Diags, const LangOptions &Features) {
   typedef uint32_t UTF32;
   UTF32 UcnVal = 0;
   unsigned short UcnLen = 0;
@@ -230,19 +243,22 @@
     return;
   }
 
-  if (wide) {
-    (void)UcnLen;
-    assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
+         "only character widths of 1, 2, or 4 bytes supported");
 
-    if (!Features.ShortWChar) {
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultBuf++ = (UcnVal & 0x000000FF);
-      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
-      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
-      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
-      return;
-    }
+  (void)UcnLen;
+  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 
+  if (CharByteWidth == 4) {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    *ResultBuf++ = (UcnVal & 0x000000FF);
+    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+    return;
+  }
+
+  if (CharByteWidth == 2) {
     // Convert to UTF16.
     if (UcnVal < (UTF32)0xFFFF) {
       *ResultBuf++ = (UcnVal & 0x000000FF);
@@ -261,6 +277,9 @@
     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
     return;
   }
+
+  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
+
   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
   // The conversion below was inspired by:
   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
@@ -695,13 +714,18 @@
 
 
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
-                                     SourceLocation Loc, Preprocessor &PP) {
+                                     SourceLocation Loc, Preprocessor &PP,
+                                     tok::TokenKind kind) {
   // At this point we know that the character matches the regex "L?'.*'".
   HadError = false;
 
-  // Determine if this is a wide character.
-  IsWide = begin[0] == 'L';
-  if (IsWide) ++begin;
+  Kind = kind;
+
+  // Determine if this is a wide or UTF character.
+  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
+      Kind == tok::utf32_char_constant) {
+    ++begin;
+  }
 
   // Skip over the entry quote.
   assert(begin[0] == '\'' && "Invalid token lexed");
@@ -742,17 +766,17 @@
         ResultChar = utf32;
       } else {
         // Otherwise, this is a non-UCN escape character.  Process it.
+        unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
         ResultChar = ProcessCharEscape(begin, end, HadError,
                                        FullSourceLoc(Loc,PP.getSourceManager()),
-                                       IsWide,
-                                       &PP.getDiagnostics(), PP.getTargetInfo());
+                                       CharWidth, &PP.getDiagnostics());
       }
     }
 
     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
     // implementation defined (C99 6.4.4.4p10).
     if (NumCharsSoFar) {
-      if (IsWide) {
+      if (!isAscii()) {
         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
         LitVal = 0;
       } else {
@@ -774,8 +798,8 @@
   if (NumCharsSoFar > 1) {
     // Warn about discarding the top bits for multi-char wide-character
     // constants (L'abcd').
-    if (IsWide)
-      PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+    if (!isAscii())
+      PP.Diag(Loc, diag::warn_extraneous_char_constant);
     else if (NumCharsSoFar != 4)
       PP.Diag(Loc, diag::ext_multichar_character_literal);
     else
@@ -787,14 +811,15 @@
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
 
-  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+  if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) &&
+      Value > 0xFFFF)
     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 
   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
   // character constants are not sign extended in the this implementation:
   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
-  if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
+  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
       PP.getLangOptions().CharIsSigned)
     Value = (signed char)Value;
 }
@@ -839,8 +864,8 @@
                     Preprocessor &PP, bool Complain)
   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
-    MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
-    ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
   init(StringToks, NumStringToks);
 }
 
@@ -860,7 +885,7 @@
   MaxTokenLength = StringToks[0].getLength();
   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  AnyWide = StringToks[0].is(tok::wide_string_literal);
+  Kind = StringToks[0].getKind();
 
   hadError = false;
 
@@ -881,8 +906,18 @@
     if (StringToks[i].getLength() > MaxTokenLength)
       MaxTokenLength = StringToks[i].getLength();
 
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].is(tok::wide_string_literal);
+    // Remember if we see any wide or utf-8/16/32 strings.
+    // Also check for illegal concatenations.
+    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
+      if (isAscii()) {
+        Kind = StringToks[i].getKind();
+      } else {
+        if (Diags)
+          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+                        diag::err_unsupported_string_concat);
+        hadError = true;
+      }
+    }
   }
 
   // Include space for the null terminator.
@@ -890,19 +925,14 @@
 
   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  wchar_tByteWidth = ~0U;
-  if (AnyWide) {
-    wchar_tByteWidth = Target.getWCharWidth();
-    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
-    wchar_tByteWidth /= 8;
-  }
+  // Get the width in bytes of char/wchar_t/char16_t/char32_t
+  CharByteWidth = getCharWidth(Kind, Target);
+  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
+  CharByteWidth /= 8;
 
   // The output buffer size needs to be large enough to hold wide characters.
   // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
+  SizeBound *= CharByteWidth;
 
   // Size the temporary buffer to hold the result string data.
   ResultBuf.resize(SizeBound);
@@ -927,18 +957,19 @@
       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                          &StringInvalid);
     if (StringInvalid) {
-      hadError = 1;
+      hadError = true;
       continue;
     }
 
     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    bool wide = false;
     // TODO: Input character set mapping support.
 
     // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') {
-      wide = true;
+    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
       ++ThisTokBuf;
+      // Skip 8 of u8 marker for utf8 strings.
+      if (ThisTokBuf[0] == '8')
+        ++ThisTokBuf;
     }
 
     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
@@ -967,7 +998,7 @@
 
         // Copy the character span over.
         unsigned Len = ThisTokBuf-InStart;
-        if (!AnyWide) {
+        if (CharByteWidth == 1) {
           memcpy(ResultPtr, InStart, Len);
           ResultPtr += Len;
         } else {
@@ -975,7 +1006,7 @@
           for (; Len; --Len, ++InStart) {
             *ResultPtr++ = InStart[0];
             // Add zeros at the end.
-            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
               *ResultPtr++ = 0;
           }
         }
@@ -985,29 +1016,26 @@
       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
         EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
                         hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
-                        wide, Diags, Features);
+                        CharByteWidth, Diags, Features);
         continue;
       }
       // Otherwise, this is a non-UCN escape character.  Process it.
       unsigned ResultChar =
         ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
                           FullSourceLoc(StringToks[i].getLocation(), SM),
-                          AnyWide, Diags, Target);
+                          CharByteWidth*8, Diags);
 
       // Note: our internal rep of wide char tokens is always little-endian.
       *ResultPtr++ = ResultChar & 0xFF;
 
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
-      }
+      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+        *ResultPtr++ = ResultChar >> i*8;
     }
   }
 
   if (Pascal) {
     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
-    if (AnyWide)
-      ResultBuf[0] /= wchar_tByteWidth;
+    ResultBuf[0] /= CharByteWidth;
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
@@ -1016,7 +1044,7 @@
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
-      hadError = 1;
+      hadError = true;
       return;
     }
   } else if (Diags) {
@@ -1050,7 +1078,8 @@
   if (StringInvalid)
     return 0;
 
-  assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
+  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
+         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
 
 
   const char *SpellingStart = SpellingPtr;
@@ -1075,7 +1104,7 @@
     bool HadError = false;
     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
                       FullSourceLoc(Tok.getLocation(), SM),
-                      false, Diags, Target);
+                      CharByteWidth*8, Diags);
     assert(!HadError && "This method isn't valid on erroneous strings");
     --ByteNo;
   }

diff --git a/lib/Lex/MacroArgs.cpp b/lib/Lex/MacroArgs.cpp
index 968c15e..ccd0b70 100644
--- a/lib/Lex/MacroArgs.cpp
+++ b/lib/Lex/MacroArgs.cpp

@@ -208,7 +208,13 @@
     // by 6.10.3.2p2.
     if (Tok.is(tok::string_literal) ||       // "foo"
         Tok.is(tok::wide_string_literal) ||  // L"foo"
-        Tok.is(tok::char_constant)) {        // 'x' and L'x'.
+        Tok.is(tok::utf8_string_literal) ||  // u8"foo"
+        Tok.is(tok::utf16_string_literal) || // u"foo"
+        Tok.is(tok::utf32_string_literal) || // U"foo"
+        Tok.is(tok::char_constant) ||        // 'x'
+        Tok.is(tok::wide_char_constant) ||   // L'x'.
+        Tok.is(tok::utf16_char_constant) ||  // u'x'.
+        Tok.is(tok::utf32_char_constant)) {  // U'x'.
       bool Invalid = false;
       std::string TokStr = PP.getSpelling(Tok, &Invalid);
       if (!Invalid) {

diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index 212ffee..383c6f5 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp

@@ -777,7 +777,7 @@
   } else {
     // Parse and validate the string, converting it into a unique ID.
     StringLiteralParser Literal(&StrTok, 1, *this);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return DiscardUntilEndOfDirective();
     if (Literal.Pascal) {
@@ -910,7 +910,7 @@
   } else {
     // Parse and validate the string, converting it into a unique ID.
     StringLiteralParser Literal(&StrTok, 1, *this);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return DiscardUntilEndOfDirective();
     if (Literal.Pascal) {

diff --git a/lib/Lex/PPExpressions.cpp b/lib/Lex/PPExpressions.cpp
index 08e2705..2581692 100644
--- a/lib/Lex/PPExpressions.cpp
+++ b/lib/Lex/PPExpressions.cpp

@@ -236,7 +236,10 @@
     PP.LexNonComment(PeekTok);
     return false;
   }
-  case tok::char_constant: {   // 'x'
+  case tok::char_constant:          // 'x'
+  case tok::wide_char_constant: {   // L'x'
+  case tok::utf16_char_constant:    // u'x'
+  case tok::utf32_char_constant:    // U'x'
     llvm::SmallString<32> CharBuffer;
     bool CharInvalid = false;
     StringRef ThisTok = PP.getSpelling(PeekTok, CharBuffer, &CharInvalid);
@@ -244,7 +247,7 @@
       return true;
 
     CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(),
-                              PeekTok.getLocation(), PP);
+                              PeekTok.getLocation(), PP, PeekTok.getKind());
     if (Literal.hadError())
       return true;  // A diagnostic was already emitted.
 
@@ -255,6 +258,10 @@
       NumBits = TI.getIntWidth();
     else if (Literal.isWide())
       NumBits = TI.getWCharWidth();
+    else if (Literal.isUTF16())
+      NumBits = TI.getChar16Width();
+    else if (Literal.isUTF32())
+      NumBits = TI.getChar32Width();
     else
       NumBits = TI.getCharWidth();
 
@@ -262,8 +269,9 @@
     llvm::APSInt Val(NumBits);
     // Set the value.
     Val = Literal.getValue();
-    // Set the signedness.
-    Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
+    // Set the signedness. UTF-16 and UTF-32 are always unsigned
+    if (!Literal.isUTF16() && !Literal.isUTF32())
+      Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
 
     if (Result.Val.getBitWidth() > Val.getBitWidth()) {
       Result.Val = Val.extend(Result.Val.getBitWidth());

diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp
index d94e2e8..1d0b5e4 100644
--- a/lib/Lex/Pragma.cpp
+++ b/lib/Lex/Pragma.cpp

@@ -444,7 +444,7 @@
 
     // Concatenate and parse the strings.
     StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return;
     if (Literal.Pascal) {
@@ -520,7 +520,7 @@
 
   // Concatenate and parse the strings.
   StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
-  assert(!Literal.AnyWide && "Didn't allow wide strings in");
+  assert(Literal.isAscii() && "Didn't allow wide strings in");
   if (Literal.hadError)
     return;
   if (Literal.Pascal) {
@@ -902,7 +902,7 @@
 
     // Concatenate and parse the strings.
     StringLiteralParser Literal(&StrToks[0], StrToks.size(), PP);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return;
     if (Literal.Pascal) {

diff --git a/lib/Lex/TokenConcatenation.cpp b/lib/Lex/TokenConcatenation.cpp
index 3e9e855..19baf80 100644
--- a/lib/Lex/TokenConcatenation.cpp
+++ b/lib/Lex/TokenConcatenation.cpp

@@ -17,42 +17,39 @@
 using namespace clang;
 
 
-/// StartsWithL - Return true if the spelling of this token starts with 'L'.
-bool TokenConcatenation::StartsWithL(const Token &Tok) const {
+/// IsIdentifierStringPrefix - Return true if the spelling of the token
+/// is literally 'L', 'u', 'U', or 'u8'.
+bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
+  const LangOptions &LangOpts = PP.getLangOptions();
+
   if (!Tok.needsCleaning()) {
+    if (Tok.getLength() != 1 && Tok.getLength() != 2)
+      return false;
     SourceManager &SM = PP.getSourceManager();
-    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
+    const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
+    if (Tok.getLength() == 1)
+      return Ptr[0] == 'L' ||
+             (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U'));
+    if (Tok.getLength() == 2)
+      return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8';
   }
 
   if (Tok.getLength() < 256) {
     char Buffer[256];
     const char *TokPtr = Buffer;
-    PP.getSpelling(Tok, TokPtr);
-    return TokPtr[0] == 'L';
+    unsigned length = PP.getSpelling(Tok, TokPtr);
+    if (length == 1)
+      return TokPtr[0] == 'L' ||
+             (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U'));
+    if (length == 2)
+      return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8';
+    return false;
   }
 
-  return PP.getSpelling(Tok)[0] == 'L';
-}
-
-/// IsIdentifierL - Return true if the spelling of this token is literally
-/// 'L'.
-bool TokenConcatenation::IsIdentifierL(const Token &Tok) const {
-  if (!Tok.needsCleaning()) {
-    if (Tok.getLength() != 1)
-      return false;
-    SourceManager &SM = PP.getSourceManager();
-    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
-  }
-
-  if (Tok.getLength() < 256) {
-    char Buffer[256];
-    const char *TokPtr = Buffer;
-    if (PP.getSpelling(Tok, TokPtr) != 1)
-      return false;
-    return TokPtr[0] == 'L';
-  }
-
-  return PP.getSpelling(Tok) == "L";
+  std::string TokStr = PP.getSpelling(Tok);
+  return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" ||
+                                                    TokStr == "u" ||
+                                                    TokStr == "U"));
 }
 
 TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
@@ -179,24 +176,19 @@
     if (Tok.is(tok::numeric_constant))
       return GetFirstChar(PP, Tok) != '.';
 
-    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) /* ||
-     Tok.is(tok::wide_char_literal)*/)
+    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
+        Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
+        Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
+        Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant))
       return true;
 
     // If this isn't identifier + string, we're done.
     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
       return false;
 
-    // FIXME: need a wide_char_constant!
-
-    // If the string was a wide string L"foo" or wide char L'f', it would
-    // concat with the previous identifier into fooL"bar".  Avoid this.
-    if (StartsWithL(Tok))
-      return true;
-
     // Otherwise, this is a narrow character or string.  If the *identifier*
-    // is a literal 'L', avoid pasting L "foo" -> L"foo".
-    return IsIdentifierL(PrevTok);
+    // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
+    return IsIdentifierStringPrefix(PrevTok);
   case tok::numeric_constant:
     return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
     FirstChar == '+' || FirstChar == '-' || FirstChar == '.';

diff --git a/lib/Parse/ParseCXXInlineMethods.cpp b/lib/Parse/ParseCXXInlineMethods.cpp
index f5c6998..e164480 100644
--- a/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/lib/Parse/ParseCXXInlineMethods.cpp

@@ -553,6 +553,9 @@
 
     case tok::string_literal:
     case tok::wide_string_literal:
+    case tok::utf8_string_literal:
+    case tok::utf16_string_literal:
+    case tok::utf32_string_literal:
       Toks.push_back(Tok);
       ConsumeStringToken();
       break;

diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp
index 869d9de..3cd1f39 100644
--- a/lib/Parse/ParseExpr.cpp
+++ b/lib/Parse/ParseExpr.cpp

@@ -769,6 +769,9 @@
     break;
   }
   case tok::char_constant:     // constant: character-constant
+  case tok::wide_char_constant:
+  case tok::utf16_char_constant:
+  case tok::utf32_char_constant:
     Res = Actions.ActOnCharacterConstant(Tok);
     ConsumeToken();
     break;
@@ -780,6 +783,9 @@
     break;
   case tok::string_literal:    // primary-expression: string-literal
   case tok::wide_string_literal:
+  case tok::utf8_string_literal:
+  case tok::utf16_string_literal:
+  case tok::utf32_string_literal:
     Res = ParseStringLiteralExpression();
     break;
   case tok::kw__Generic:   // primary-expression: generic-selection [C1X 6.5.1]

diff --git a/lib/Parse/ParseTentative.cpp b/lib/Parse/ParseTentative.cpp
index 2ba0fc6..3f245a3 100644
--- a/lib/Parse/ParseTentative.cpp
+++ b/lib/Parse/ParseTentative.cpp

@@ -605,8 +605,14 @@
   // Obviously starts an expression.
   case tok::numeric_constant:
   case tok::char_constant:
+  case tok::wide_char_constant:
+  case tok::utf16_char_constant:
+  case tok::utf32_char_constant:
   case tok::string_literal:
   case tok::wide_string_literal:
+  case tok::utf8_string_literal:
+  case tok::utf16_string_literal:
+  case tok::utf32_string_literal:
   case tok::l_square:
   case tok::l_paren:
   case tok::amp:

diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index 9dc867c..5bb4165 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp

@@ -298,6 +298,9 @@
 
     case tok::string_literal:
     case tok::wide_string_literal:
+    case tok::utf8_string_literal:
+    case tok::utf16_string_literal:
+    case tok::utf32_string_literal:
       ConsumeStringToken();
       break;
         

diff --git a/lib/Rewrite/HTMLRewrite.cpp b/lib/Rewrite/HTMLRewrite.cpp
index 27f383f..ad2491c 100644
--- a/lib/Rewrite/HTMLRewrite.cpp
+++ b/lib/Rewrite/HTMLRewrite.cpp

@@ -397,8 +397,15 @@
       HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
                      "<span class='comment'>", "</span>");
       break;
+    case tok::utf8_string_literal:
+      // Chop off the u part of u8 prefix
+      ++TokOffs;
+      --TokLen;
+      // FALL THROUGH to chop the 8
     case tok::wide_string_literal:
-      // Chop off the L prefix
+    case tok::utf16_string_literal:
+    case tok::utf32_string_literal:
+      // Chop off the L, u, U or 8 prefix
       ++TokOffs;
       --TokLen;
       // FALL THROUGH.

diff --git a/lib/Rewrite/RewriteObjC.cpp b/lib/Rewrite/RewriteObjC.cpp
index 585b43c..a8fefb0 100644
--- a/lib/Rewrite/RewriteObjC.cpp
+++ b/lib/Rewrite/RewriteObjC.cpp

@@ -2111,8 +2111,8 @@
   std::string StrEncoding;
   Context->getObjCEncodingForType(Exp->getEncodedType(), StrEncoding);
   Expr *Replacement = StringLiteral::Create(*Context, StrEncoding,
-                                            false, false, StrType,
-                                            SourceLocation());
+                                            StringLiteral::Ascii, false,
+                                            StrType, SourceLocation());
   ReplaceStmt(Exp, Replacement);
 
   // Replace this subexpr in the parent.
@@ -2129,8 +2129,8 @@
   QualType argType = Context->getPointerType(Context->CharTy);
   SelExprs.push_back(StringLiteral::Create(*Context,
                                            Exp->getSelector().getAsString(),
-                                           false, false, argType, 
-                                           SourceLocation()));
+                                           StringLiteral::Ascii, false,
+                                           argType, SourceLocation()));
   CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl,
                                                  &SelExprs[0], SelExprs.size());
   ReplaceStmt(Exp, SelExp);
@@ -2797,7 +2797,8 @@
     QualType argType = Context->getPointerType(Context->CharTy);
     ClsExprs.push_back(StringLiteral::Create(*Context,
                                    ClassDecl->getIdentifier()->getName(),
-                                   false, false, argType, SourceLocation()));
+                                   StringLiteral::Ascii, false,
+                                   argType, SourceLocation()));
     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetMetaClassFunctionDecl,
                                                  &ClsExprs[0],
                                                  ClsExprs.size(),
@@ -2875,7 +2876,7 @@
     IdentifierInfo *clsName = Class->getIdentifier();
     ClsExprs.push_back(StringLiteral::Create(*Context,
                                              clsName->getName(),
-                                             false, false, 
+                                             StringLiteral::Ascii, false,
                                              argType, SourceLocation()));
     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl,
                                                  &ClsExprs[0],
@@ -2906,7 +2907,8 @@
     QualType argType = Context->getPointerType(Context->CharTy);
     ClsExprs.push_back(StringLiteral::Create(*Context,
                                    ClassDecl->getIdentifier()->getName(),
-                                   false, false, argType, SourceLocation()));
+                                   StringLiteral::Ascii, false, argType,
+                                   SourceLocation()));
     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl,
                                                  &ClsExprs[0],
                                                  ClsExprs.size(), 
@@ -2987,7 +2989,8 @@
   QualType argType = Context->getPointerType(Context->CharTy);
   SelExprs.push_back(StringLiteral::Create(*Context,
                                        Exp->getSelector().getAsString(),
-                                       false, false, argType, SourceLocation()));
+                                       StringLiteral::Ascii, false,
+                                       argType, SourceLocation()));
   CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl,
                                                  &SelExprs[0], SelExprs.size(),
                                                   StartLoc,

diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index 2e4198b..28085ef 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp

@@ -605,7 +605,7 @@
   Arg = Arg->IgnoreParenCasts();
   StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);
 
-  if (!Literal || Literal->isWide()) {
+  if (!Literal || !Literal->isAscii()) {
     Diag(Arg->getLocStart(), diag::err_cfstring_literal_not_string_constant)
       << Arg->getSourceRange();
     return true;
@@ -1805,7 +1805,7 @@
                              bool isPrintf) {
   
   // CHECK: is the format string a wide literal?
-  if (FExpr->isWide()) {
+  if (!FExpr->isAscii()) {
     Diag(FExpr->getLocStart(),
          diag::warn_format_string_is_wide_literal)
     << OrigFormatExpr->getSourceRange();

diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp
index 9e20bc9..2cbd83a 100644
--- a/lib/Sema/SemaDeclAttr.cpp
+++ b/lib/Sema/SemaDeclAttr.cpp

@@ -712,7 +712,7 @@
     Arg = Arg->IgnoreParenCasts();
     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
 
-    if (Str == 0 || Str->isWide()) {
+    if (!Str || !Str->isAscii()) {
       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
           << "weakref" << 1;
       return;
@@ -737,7 +737,7 @@
   Arg = Arg->IgnoreParenCasts();
   StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
 
-  if (Str == 0 || Str->isWide()) {
+  if (!Str || !Str->isAscii()) {
     S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
       << "alias" << 1;
     return;
@@ -1162,7 +1162,7 @@
   Arg = Arg->IgnoreParenCasts();
   StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
 
-  if (Str == 0 || Str->isWide()) {
+  if (!Str || !Str->isAscii()) {
     S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
       << "visibility" << 1;
     return;
@@ -2464,7 +2464,7 @@
   case AttributeList::AT_pcs: {
     Expr *Arg = Attr.getArg(0);
     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
-    if (Str == 0 || Str->isWide()) {
+    if (!Str || !Str->isAscii()) {
       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
         << "pcs" << 1;
       Attr.setInvalid();
@@ -2519,7 +2519,7 @@
   case AttributeList::AT_pcs: {
     Expr *Arg = attr.getArg(0);
     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
-    if (Str == 0 || Str->isWide()) {
+    if (!Str || !Str->isAscii()) {
       Diag(attr.getLoc(), diag::err_attribute_argument_n_not_string)
         << "pcs" << 1;
       attr.setInvalid();
@@ -2868,7 +2868,7 @@
 
     Expr *Arg = Attr.getArg(0);
     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
-    if (Str == 0 || Str->isWide()) {
+    if (!Str || !Str->isAscii()) {
       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
         << "uuid" << 1;
       return;

diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 4a9b4bc..dedf7b0 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp

@@ -997,11 +997,25 @@
     StringTokLocs.push_back(StringToks[i].getLocation());
 
   QualType StrTy = Context.CharTy;
-  if (Literal.AnyWide) 
+  if (Literal.isWide())
     StrTy = Context.getWCharType();
+  else if (Literal.isUTF16())
+    StrTy = Context.Char16Ty;
+  else if (Literal.isUTF32())
+    StrTy = Context.Char32Ty;
   else if (Literal.Pascal)
     StrTy = Context.UnsignedCharTy;
 
+  StringLiteral::StringKind Kind = StringLiteral::Ascii;
+  if (Literal.isWide())
+    Kind = StringLiteral::Wide;
+  else if (Literal.isUTF8())
+    Kind = StringLiteral::UTF8;
+  else if (Literal.isUTF16())
+    Kind = StringLiteral::UTF16;
+  else if (Literal.isUTF32())
+    Kind = StringLiteral::UTF32;
+
   // A C++ string literal has a const-qualified element type (C++ 2.13.4p1).
   if (getLangOptions().CPlusPlus || getLangOptions().ConstStrings)
     StrTy.addConst();
@@ -1015,7 +1029,7 @@
 
   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
   return Owned(StringLiteral::Create(Context, Literal.GetString(),
-                                     Literal.AnyWide, Literal.Pascal, StrTy,
+                                     Kind, Literal.Pascal, StrTy,
                                      &StringTokLocs[0],
                                      StringTokLocs.size()));
 }
@@ -2412,7 +2426,7 @@
     return ExprError();
 
   CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(), Tok.getLocation(),
-                            PP);
+                            PP, Tok.getKind());
   if (Literal.hadError())
     return ExprError();
 
@@ -2421,14 +2435,25 @@
     Ty = Context.IntTy;   // 'x' and L'x' -> int in C.
   else if (Literal.isWide())
     Ty = Context.WCharTy; // L'x' -> wchar_t in C++.
+  else if (Literal.isUTF16())
+    Ty = Context.Char16Ty; // u'x' -> char16_t in C++0x.
+  else if (Literal.isUTF32())
+    Ty = Context.Char32Ty; // U'x' -> char32_t in C++0x.
   else if (Literal.isMultiChar())
     Ty = Context.IntTy;   // 'wxyz' -> int in C++.
   else
     Ty = Context.CharTy;  // 'x' -> char in C++
 
-  return Owned(new (Context) CharacterLiteral(Literal.getValue(),
-                                              Literal.isWide(),
-                                              Ty, Tok.getLocation()));
+  CharacterLiteral::CharacterKind Kind = CharacterLiteral::Ascii;
+  if (Literal.isWide())
+    Kind = CharacterLiteral::Wide;
+  else if (Literal.isUTF16())
+    Kind = CharacterLiteral::UTF16;
+  else if (Literal.isUTF32())
+    Kind = CharacterLiteral::UTF32;
+
+  return Owned(new (Context) CharacterLiteral(Literal.getValue(), Kind, Ty,
+                                              Tok.getLocation()));
 }
 
 ExprResult Sema::ActOnNumericConstant(const Token &Tok) {
@@ -8624,7 +8649,7 @@
 
   // Strip off any parens and casts.
   StringLiteral *SL = dyn_cast<StringLiteral>(SrcExpr->IgnoreParenCasts());
-  if (!SL || SL->isWide())
+  if (!SL || !SL->isAscii())
     return;
 
   Hint = FixItHint::CreateInsertion(SL->getLocStart(), "@");

diff --git a/lib/Sema/SemaExprCXX.cpp b/lib/Sema/SemaExprCXX.cpp
index 94a5baf..1812510 100644
--- a/lib/Sema/SemaExprCXX.cpp
+++ b/lib/Sema/SemaExprCXX.cpp

@@ -2041,12 +2041,20 @@
           = ToPtrType->getPointeeType()->getAs<BuiltinType>()) {
         // This conversion is considered only when there is an
         // explicit appropriate pointer target type (C++ 4.2p2).
-        if (!ToPtrType->getPointeeType().hasQualifiers() &&
-            ((StrLit->isWide() && ToPointeeType->isWideCharType()) ||
-             (!StrLit->isWide() &&
-              (ToPointeeType->getKind() == BuiltinType::Char_U ||
-               ToPointeeType->getKind() == BuiltinType::Char_S))))
-          return true;
+        if (!ToPtrType->getPointeeType().hasQualifiers()) {
+          switch (StrLit->getKind()) {
+            case StringLiteral::UTF8:
+            case StringLiteral::UTF16:
+            case StringLiteral::UTF32:
+              // We don't allow UTF literals to be implicitly converted
+              break;
+            case StringLiteral::Ascii:
+              return (ToPointeeType->getKind() == BuiltinType::Char_U ||
+                      ToPointeeType->getKind() == BuiltinType::Char_S);
+            case StringLiteral::Wide:
+              return ToPointeeType->isWideCharType();
+          }
+        }
       }
 
   return false;

diff --git a/lib/Sema/SemaExprObjC.cpp b/lib/Sema/SemaExprObjC.cpp
index fccea7c..e88726b 100644
--- a/lib/Sema/SemaExprObjC.cpp
+++ b/lib/Sema/SemaExprObjC.cpp

@@ -47,8 +47,8 @@
     for (unsigned i = 0; i != NumStrings; ++i) {
       S = Strings[i];
 
-      // ObjC strings can't be wide.
-      if (S->isWide()) {
+      // ObjC strings can't be wide or UTF.
+      if (!S->isAscii()) {
         Diag(S->getLocStart(), diag::err_cfstring_literal_not_string_constant)
           << S->getSourceRange();
         return true;
@@ -64,7 +64,7 @@
     // Create the aggregate string with the appropriate content and location
     // information.
     S = StringLiteral::Create(Context, StrBuf,
-                              /*Wide=*/false, /*Pascal=*/false,
+                              StringLiteral::Ascii, /*Pascal=*/false,
                               Context.getPointerType(Context.CharTy),
                               &StrLocs[0], StrLocs.size());
   }

diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp
index adf88c6..c406ad9 100644
--- a/lib/Sema/SemaInit.cpp
+++ b/lib/Sema/SemaInit.cpp

@@ -49,20 +49,30 @@
   if (SL == 0) return 0;
 
   QualType ElemTy = Context.getCanonicalType(AT->getElementType());
-  // char array can be initialized with a narrow string.
-  // Only allow char x[] = "foo";  not char x[] = L"foo";
-  if (!SL->isWide())
+
+  switch (SL->getKind()) {
+  case StringLiteral::Ascii:
+  case StringLiteral::UTF8:
+    // char array can be initialized with a narrow string.
+    // Only allow char x[] = "foo";  not char x[] = L"foo";
     return ElemTy->isCharType() ? Init : 0;
+  case StringLiteral::UTF16:
+    return ElemTy->isChar16Type() ? Init : 0;
+  case StringLiteral::UTF32:
+    return ElemTy->isChar32Type() ? Init : 0;
+  case StringLiteral::Wide:
+    // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with
+    // correction from DR343): "An array with element type compatible with a
+    // qualified or unqualified version of wchar_t may be initialized by a wide
+    // string literal, optionally enclosed in braces."
+    if (Context.typesAreCompatible(Context.getWCharType(),
+                                   ElemTy.getUnqualifiedType()))
+      return Init;
 
-  // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with
-  // correction from DR343): "An array with element type compatible with a
-  // qualified or unqualified version of wchar_t may be initialized by a wide
-  // string literal, optionally enclosed in braces."
-  if (Context.typesAreCompatible(Context.getWCharType(),
-                                 ElemTy.getUnqualifiedType()))
-    return Init;
+    return 0;
+  }
 
-  return 0;
+  llvm_unreachable("missed a StringLiteral kind?");
 }
 
 static Expr *IsStringInit(Expr *init, QualType declType, ASTContext &Context) {

diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp
index 0fd3f03..56161ed 100644
--- a/lib/Sema/SemaStmt.cpp
+++ b/lib/Sema/SemaStmt.cpp

@@ -1952,13 +1952,13 @@
   SmallVector<TargetInfo::ConstraintInfo, 4> OutputConstraintInfos;
 
   // The parser verifies that there is a string literal here.
-  if (AsmString->isWide())
+  if (!AsmString->isAscii())
     return StmtError(Diag(AsmString->getLocStart(),diag::err_asm_wide_character)
       << AsmString->getSourceRange());
 
   for (unsigned i = 0; i != NumOutputs; i++) {
     StringLiteral *Literal = Constraints[i];
-    if (Literal->isWide())
+    if (!Literal->isAscii())
       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
         << Literal->getSourceRange());
 
@@ -1987,7 +1987,7 @@
 
   for (unsigned i = NumOutputs, e = NumOutputs + NumInputs; i != e; i++) {
     StringLiteral *Literal = Constraints[i];
-    if (Literal->isWide())
+    if (!Literal->isAscii())
       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
         << Literal->getSourceRange());
 
@@ -2034,7 +2034,7 @@
   // Check that the clobbers are valid.
   for (unsigned i = 0; i != NumClobbers; i++) {
     StringLiteral *Literal = Clobbers[i];
-    if (Literal->isWide())
+    if (!Literal->isAscii())
       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
         << Literal->getSourceRange());
 

diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp
index ceab7e9..006017f 100644
--- a/lib/Sema/SemaTemplate.cpp
+++ b/lib/Sema/SemaTemplate.cpp

@@ -4131,10 +4131,22 @@
   assert(Arg.getKind() == TemplateArgument::Integral &&
          "Operation is only valid for integral template arguments");
   QualType T = Arg.getIntegralType();
-  if (T->isCharType() || T->isWideCharType())
+  if (T->isAnyCharacterType()) {
+    CharacterLiteral::CharacterKind Kind;
+    if (T->isWideCharType())
+      Kind = CharacterLiteral::Wide;
+    else if (T->isChar16Type())
+      Kind = CharacterLiteral::UTF16;
+    else if (T->isChar32Type())
+      Kind = CharacterLiteral::UTF32;
+    else
+      Kind = CharacterLiteral::Ascii;
+
     return Owned(new (Context) CharacterLiteral(
-                                             Arg.getAsIntegral()->getZExtValue(),
-                                             T->isWideCharType(), T, Loc));
+                                            Arg.getAsIntegral()->getZExtValue(),
+                                            Kind, T, Loc));
+  }
+
   if (T->isBooleanType())
     return Owned(new (Context) CXXBoolLiteralExpr(
                                             Arg.getAsIntegral()->getBoolValue(),

diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp
index 3559cce..7a3c589 100644
--- a/lib/Serialization/ASTReaderStmt.cpp
+++ b/lib/Serialization/ASTReaderStmt.cpp

@@ -371,7 +371,7 @@
   assert(Record[Idx] == E->getNumConcatenated() &&
          "Wrong number of concatenated tokens!");
   ++Idx;
-  E->IsWide = Record[Idx++];
+  E->Kind = static_cast<StringLiteral::StringKind>(Record[Idx++]);
   E->IsPascal = Record[Idx++];
 
   // Read string data
@@ -388,7 +388,7 @@
   VisitExpr(E);
   E->setValue(Record[Idx++]);
   E->setLocation(ReadSourceLocation(Record, Idx));
-  E->setWide(Record[Idx++]);
+  E->setKind(static_cast<CharacterLiteral::CharacterKind>(Record[Idx++]));
 }
 
 void ASTStmtReader::VisitParenExpr(ParenExpr *E) {

diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp
index 0b5bc1f..f0636a1 100644
--- a/lib/Serialization/ASTWriterStmt.cpp
+++ b/lib/Serialization/ASTWriterStmt.cpp

@@ -324,7 +324,7 @@
   VisitExpr(E);
   Record.push_back(E->getByteLength());
   Record.push_back(E->getNumConcatenated());
-  Record.push_back(E->isWide());
+  Record.push_back(E->getKind());
   Record.push_back(E->isPascal());
   // FIXME: String data should be stored as a blob at the end of the
   // StringLiteral. However, we can't do so now because we have no
@@ -340,7 +340,7 @@
   VisitExpr(E);
   Record.push_back(E->getValue());
   Writer.AddSourceLocation(E->getLocation(), Record);
-  Record.push_back(E->isWide());
+  Record.push_back(E->getKind());
 
   AbbrevToUse = Writer.getCharacterLiteralAbbrev();
commit	5cee1195584fa8672253139c86e922daeda69b9e	[log] [tgz]
author	Douglas Gregor <dgregor@apple.com>	Wed Jul 27 05:40:30 2011 +0000
committer	Douglas Gregor <dgregor@apple.com>	Wed Jul 27 05:40:30 2011 +0000
tree	e1b36e0f628359bb42d22d78c74e931057b962de
parent	6fa8f86b8188c6d3c4d6616122a71ccd72a0c78a [diff]