Implement C++0x user-defined string literals.

The extra data stored on user-defined literal Tokens is kept in separately
allocated memory, which is managed by the PreprocessorLexer because there isn't
a better place to put it that guarantees it is deallocated, but only after it
has been used up. My testing has shown no significant slowdown as a result, but
independent testing would be appreciated.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@112458 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 6cd1873..b4cafb4 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -548,6 +548,11 @@
   isInited = true;
 }
 
+/// isIdentifierStart - Return true if this is the start character of an
+/// identifier, which is [a-zA-Z_].
+static inline bool isIdentifierStart(unsigned char c) {
+  return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
+}
 
 /// isIdentifierBody - Return true if this is the body character of an
 /// identifier, which is [a-zA-Z0-9_].
@@ -982,8 +987,30 @@
 
   // Update the location of the token as well as the BufferPtr instance var.
   const char *TokStart = BufferPtr;
-  FormTokenWithChars(Result, CurPtr,
-                     Wide ? tok::wide_string_literal : tok::string_literal);
+  tok::TokenKind Kind = Wide ? tok::wide_string_literal : tok::string_literal;
+
+  // FIXME: Handle UCNs
+  unsigned Size;
+  if (PP && PP->getLangOptions().CPlusPlus0x &&
+      isIdentifierStart(getCharAndSize(CurPtr, Size))) {
+    Result.makeUserDefinedLiteral(ExtraDataAllocator);
+    Result.setFlagValue(Token::LiteralPortionClean, !Result.needsCleaning());
+    Result.setKind(Kind);
+    Result.setLiteralLength(CurPtr - BufferPtr);
+
+    // FIXME: We hack around the lexer's routines a lot here.
+    BufferPtr = CurPtr;
+    bool OldRawMode = LexingRawMode;
+    LexingRawMode = true;
+    LexIdentifier(Result, ConsumeChar(CurPtr, Size, Result));
+    LexingRawMode = OldRawMode;
+    PP->LookUpIdentifierInfo(Result, CurPtr);
+
+    CurPtr = BufferPtr;
+    BufferPtr = TokStart;
+  }
+
+  FormTokenWithChars(Result, CurPtr, Kind);
   Result.setLiteralData(TokStart);
 }
 
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index a12c4ae..eb7337a 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -758,30 +758,38 @@
 ///
 StringLiteralParser::
 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
-                    Preprocessor &pp, bool Complain) : PP(pp) {
+                    Preprocessor &pp, bool Complain) : PP(pp), hadError(false) {
   // Scan all of the string portions, remember the max individual token length,
   // computing a bound on the concatenated string length, and see whether any
   // piece is a wide-string.  If any of the string portions is a wide-string
   // literal, the result is a wide-string literal [C99 6.4.5p4].
-  MaxTokenLength = StringToks[0].getLength();
-  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
+  MaxTokenLength = StringToks[0].getLiteralLength();
+  SizeBound = StringToks[0].getLiteralLength()-2;  // -2 for "".
   AnyWide = StringToks[0].is(tok::wide_string_literal);
-
-  hadError = false;
+  UDSuffix = StringToks[0].getIdentifierInfo();
 
   // Implement Translation Phase #6: concatenation of string literals
   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
   for (unsigned i = 1; i != NumStringToks; ++i) {
     // The string could be shorter than this if it needs cleaning, but this is a
     // reasonable bound, which is all we need.
-    SizeBound += StringToks[i].getLength()-2;  // -2 for "".
+    SizeBound += StringToks[i].getLiteralLength()-2;  // -2 for "".
 
     // Remember maximum string piece length.
-    if (StringToks[i].getLength() > MaxTokenLength)
-      MaxTokenLength = StringToks[i].getLength();
+    if (StringToks[i].getLiteralLength() > MaxTokenLength)
+      MaxTokenLength = StringToks[i].getLiteralLength();
 
     // Remember if we see any wide strings.
     AnyWide |= StringToks[i].is(tok::wide_string_literal);
+
+    if (StringToks[i].isUserDefinedLiteral()) {
+      if (UDSuffix && UDSuffix != StringToks[i].getIdentifierInfo()) {
+        // FIXME: Improve location and note previous
+        PP.Diag(StringToks[0].getLocation(), diag::err_ud_suffix_mismatch);
+        hadError = true;
+      } else if (!UDSuffix)
+        UDSuffix = StringToks[i].getIdentifierInfo();
+    }
   }
 
   // Include space for the null terminator.
@@ -823,7 +831,7 @@
     // and 'spelled' tokens can only shrink.
     bool StringInvalid = false;
     unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf, 
-                                         &StringInvalid);
+                                         &StringInvalid, true);
     if (StringInvalid) {
       hadError = 1;
       continue;
@@ -938,7 +946,7 @@
                                                     bool Complain) {
   // Get the spelling of the token.
   llvm::SmallString<16> SpellingBuffer;
-  SpellingBuffer.resize(Tok.getLength());
+  SpellingBuffer.resize(Tok.getLiteralLength());
 
   bool StringInvalid = false;
   const char *SpellingPtr = &SpellingBuffer[0];
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index 5160acf..f52d354 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -352,15 +352,25 @@
 /// to point to a constant buffer with the data already in it (avoiding a
 /// copy).  The caller is not allowed to modify the returned buffer pointer
 /// if an internal buffer is returned.
-unsigned Preprocessor::getSpelling(const Token &Tok,
-                                   const char *&Buffer, bool *Invalid) const {
+///
+/// If LiteralOnly is specified, only the literal portion of the token is
+/// processed.
+unsigned Preprocessor::getSpelling(const Token &Tok, const char *&Buffer,
+                                   bool *Invalid, bool LiteralOnly) const {
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
+  assert((!LiteralOnly || Tok.isLiteral()) &&
+         "LiteralOnly used on a non-literal token");
+
+  unsigned (Token::*getLength) () const =
+    LiteralOnly ? &Token::getLiteralLength : &Token::getLength;
 
   // If this token is an identifier, just return the string from the identifier
   // table, which is very quick.
   if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
-    Buffer = II->getNameStart();
-    return II->getLength();
+    if (!Tok.isUserDefinedLiteral()) {
+      Buffer = II->getNameStart();
+      return II->getLength();
+    }
   }
 
   // Otherwise, compute the start of the token in the input lexer buffer.
@@ -381,20 +391,20 @@
   }
 
   // If this token contains nothing interesting, return it directly.
-  if (!Tok.needsCleaning()) {
+  if (!(LiteralOnly ? Tok.literalNeedsCleaning() : Tok.needsCleaning())) {
     Buffer = TokStart;
-    return Tok.getLength();
+    return (Tok.*getLength)();
   }
 
   // Otherwise, hard case, relex the characters into the string.
   char *OutBuf = const_cast<char*>(Buffer);
-  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
+  for (const char *Ptr = TokStart, *End = TokStart+(Tok.*getLength)();
        Ptr != End; ) {
     unsigned CharSize;
     *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features);
     Ptr += CharSize;
   }
-  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
+  assert(unsigned(OutBuf-Buffer) != (Tok.*getLength)() &&
          "NeedsCleaning flag set on something that didn't need cleaning!");
 
   return OutBuf-Buffer;