Initial support for semantic analysis and AST building for StringExpr nodes.
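
Adjacent string-literal tokens are now collected and concatenated into one
literal before being handed to the Actions module, and any wide piece makes the
whole literal wide [C99 6.4.5p4]. For example (illustrative declarations, not
part of this patch):

  const char *N = "foo" "bar";      // parsed as the single literal "foobar"
  const wchar_t *W = L"foo" "bar";  // wide, because one piece is wide
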
llvm-svn: 38960
diff --git a/clang/Parse/ParseExpr.cpp b/clang/Parse/ParseExpr.cpp
index 54284d0..3b641e7 100644
--- a/clang/Parse/ParseExpr.cpp
+++ b/clang/Parse/ParseExpr.cpp
@@ -22,6 +22,8 @@
#include "clang/Parse/Parser.h"
#include "clang/Basic/Diagnostic.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/Alloca.h"
using namespace llvm;
using namespace clang;
@@ -490,6 +492,7 @@
// These can be followed by postfix-expr pieces.
return ParsePostfixExpressionSuffix(Res);
case tok::string_literal: // primary-expression: string-literal
+ case tok::wide_string_literal:
Res = ParseStringLiteralExpression();
if (Res.isInvalid) return Res;
// This can be followed by postfix-expr pieces (e.g. "foo"[1]).
@@ -809,24 +812,6 @@
return ParsePostfixExpressionSuffix(Res);
}
-/// ParseStringLiteralExpression - This handles the various token types that
-/// form string literals, and also handles string concatenation [C99 5.1.1.2,
-/// translation phase #6].
-///
-/// primary-expression: [C99 6.5.1]
-/// string-literal
-Parser::ExprResult Parser::ParseStringLiteralExpression() {
- assert(isTokenStringLiteral() && "Not a string literal!");
- ConsumeStringToken();
-
- // String concat. Note that keywords like __func__ and __FUNCTION__ aren't
- // considered to be strings.
- while (isTokenStringLiteral())
- ConsumeStringToken();
- // TODO: Build AST for string literals.
- return ExprResult(false);
-}
-
/// ParseParenExpression - This parses the unit that starts with a '(' token,
/// based on what is allowed by ExprType. The actual thing parsed is returned
@@ -906,3 +891,223 @@
return Result;
}
+
+/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
+/// not valid.
+static int HexDigitValue(char C) {
+ if (C >= '0' && C <= '9') return C-'0';
+ if (C >= 'a' && C <= 'f') return C-'a'+10;
+ if (C >= 'A' && C <= 'F') return C-'A'+10;
+ return -1;
+}
+
+/// ParseStringLiteralExpression - This handles the various token types that
+/// form string literals, and also handles string concatenation [C99 5.1.1.2,
+/// translation phase #6].
+///
+/// primary-expression: [C99 6.5.1]
+/// string-literal
+Parser::ExprResult Parser::ParseStringLiteralExpression() {
+ assert(isTokenStringLiteral() && "Not a string literal!");
+
+ // String concat. Note that keywords like __func__ and __FUNCTION__ are not
+ // considered to be strings for concatenation purposes.
+ SmallVector<LexerToken, 4> StringToks;
+
+ // While we're looking at all of the string portions, remember the max
+ // individual token length, compute a bound on the concatenated string length,
+ // and see whether any piece is a wide string. If any of the string portions
+ // is a wide-string literal, the result is also a wide-string literal
+ // [C99 6.4.5p4].
+ unsigned SizeBound = 0, MaxTokenLength = 0;
+ bool AnyWide = false;
+ do {
+ // The string could be shorter than this if it needs cleaning, but this is a
+ // reasonable bound, which is all we need.
+ SizeBound += Tok.getLength()-2; // -2 for "".
+
+ // Find maximum string piece length.
+ if (Tok.getLength() > MaxTokenLength)
+ MaxTokenLength = Tok.getLength();
+
+ // Remember if we see any wide strings.
+ AnyWide |= Tok.getKind() == tok::wide_string_literal;
+
+ // Remember the string token.
+ StringToks.push_back(Tok);
+ ConsumeStringToken();
+ } while (isTokenStringLiteral());
+
+ // Include space for the null terminator.
+ ++SizeBound;
+
+ // TODO: K&R warning: "traditional C rejects string constant concatenation"
+
+ // FIXME: Size of wchar_t should not be hardcoded!
+ unsigned wchar_tByteWidth = 4;
+
+ // The output buffer size needs to be large enough to hold wide characters.
+ // This is a worst-case assumption which basically corresponds to L"" "long".
+ if (AnyWide)
+ SizeBound *= wchar_tByteWidth;
+
+ // Create a temporary buffer to hold the result string data. If it is "big",
+ // use malloc, otherwise use alloca.
+ char *ResultBuf;
+ if (SizeBound > 512)
+ ResultBuf = (char*)malloc(SizeBound);
+ else
+ ResultBuf = (char*)alloca(SizeBound);
+
+ // Likewise, but for each string piece.
+ char *TokenBuf;
+ if (MaxTokenLength > 512)
+ TokenBuf = (char*)malloc(MaxTokenLength);
+ else
+ TokenBuf = (char*)alloca(MaxTokenLength);
+
+ // Loop over all the strings, getting their spelling, and expanding them to
+ // wide strings as appropriate.
+ char *ResultPtr = ResultBuf; // Next byte to fill in.
+
+ for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
+ const char *ThisTokBuf = TokenBuf;
+ // Get the spelling of the token, which eliminates trigraphs, etc. We know
+ // that ThisTokBuf points to a buffer that is big enough for the whole token
+ // and 'spelled' tokens can only shrink.
+ unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
+ const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
+
+ // TODO: Input character set mapping support.
+
+ // Skip L marker for wide strings.
+ if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
+
+ assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
+ ++ThisTokBuf;
+
+ while (ThisTokBuf != ThisTokEnd) {
+ // Is this a span of non-escape characters?
+ if (ThisTokBuf[0] != '\\') {
+ const char *InStart = ThisTokBuf;
+ do {
+ ++ThisTokBuf;
+ } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+
+ // Copy the character span over.
+ unsigned Len = ThisTokBuf-InStart;
+ if (!AnyWide) {
+ memcpy(ResultPtr, InStart, Len);
+ ResultPtr += Len;
+ } else {
+ // Note: our internal rep of wide char tokens is always little-endian.
+ for (; Len; --Len, ++InStart) {
+ *ResultPtr++ = InStart[0];
+ // Add zeros at the end.
+ for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ *ResultPtr++ = 0;
+ }
+ }
+ continue;
+ }
+
+ // Otherwise, this is an escape character. Skip the '\' char.
+ ++ThisTokBuf;
+
+ // We know that this character can't be off the end of the buffer, because
+ // that would have been \", which would not have ended the string.
+ unsigned ResultChar = *ThisTokBuf++;
+ switch (ResultChar) {
+ // These map to themselves.
+ case '\\': case '\'': case '"': case '?': break;
+
+ // These have fixed mappings.
+ case 'a':
+ // TODO: K&R: the meaning of '\\a' is different in traditional C
+ ResultChar = 7;
+ break;
+ case 'b':
+ ResultChar = 8;
+ break;
+ case 'e':
+ PP.Diag(StringToks[i], diag::ext_nonstandard_escape, "e");
+ ResultChar = 27;
+ break;
+ case 'f':
+ ResultChar = 12;
+ break;
+ case 'n':
+ ResultChar = 10;
+ break;
+ case 'r':
+ ResultChar = 13;
+ break;
+ case 't':
+ ResultChar = 9;
+ break;
+ case 'v':
+ ResultChar = 11;
+ break;
+
+ //case 'u': case 'U': // FIXME: UCNs.
+ case 'x': // Hex escape.
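+ // Note: HexDigitValue returns -1 for a non-hex digit; once assigned to the
+ // unsigned ResultChar, that value compares equal to ~0U in the check below.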
+ if (ThisTokBuf == ThisTokEnd ||
+ (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
+ PP.Diag(StringToks[i], diag::err_hex_escape_no_digits);
+ ResultChar = 0;
+ break;
+ }
+ ++ThisTokBuf; // Consumed one hex digit.
+
+ assert(0 && "hex escape: unimp!");
+ break;
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ // Octal escapes.
+ assert(0 && "octal escape: unimp!");
+ break;
+
+ // Otherwise, these are not valid escapes.
+ case '(': case '{': case '[': case '%':
+ // GCC accepts these as extensions; we warn about them as such.
+ if (!PP.getLangOptions().NoExtensions) {
+ PP.Diag(StringToks[i], diag::ext_nonstandard_escape,
+ std::string()+(char)ResultChar);
+ break;
+ }
+ // FALL THROUGH.
+ default:
+ if (isgraph(ResultChar)) {
+ PP.Diag(StringToks[i], diag::ext_unknown_escape,
+ std::string()+(char)ResultChar);
+ } else {
+ PP.Diag(StringToks[i], diag::ext_unknown_escape,
+ "x"+utohexstr(ResultChar));
+ }
+ }
+
+ // Emit the escape result. For a narrow string this is a single byte; for a
+ // wide string, widen it to wchar_tByteWidth bytes (our internal rep of wide
+ // char tokens is always little-endian).
+ *ResultPtr++ = ResultChar & 0xFF;
+ if (AnyWide) {
+ for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ *ResultPtr++ = ResultChar >> i*8;
+ }
+ }
+ }
+
+ // Add the zero terminator; it is not included in the length handed to Actions.
+ *ResultPtr = 0;
+ if (AnyWide) {
+ for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ ResultPtr[i] = 0;
+ }
+
+ // Hand this off to the Actions.
+ ExprResult Res = Actions.ParseStringExpr(ResultBuf, ResultPtr-ResultBuf,
+ AnyWide,
+ &StringToks[0], StringToks.size());
+
+ // If either buffer was heap allocated, release it now.
+ if (MaxTokenLength > 512) free(TokenBuf);
+ if (SizeBound > 512) free(ResultBuf);
+
+ return Res;
+}
+
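
Not part of the patch: a minimal standalone sketch of the little-endian widening
the copy loop above performs, assuming the hardcoded wchar_tByteWidth of 4 and a
source string with no escape sequences.

  #include <cstdio>

  int main() {
    const unsigned wchar_tByteWidth = 4;    // matches the hardcoded width above
    const char *Src = "ab";                 // characters of a wide literal L"ab"
    unsigned char Buf[2 * 4];
    unsigned char *Out = Buf;
    for (const char *P = Src; *P; ++P) {
      *Out++ = (unsigned char)*P;           // low byte first (little-endian)
      for (unsigned i = 1; i != wchar_tByteWidth; ++i)
        *Out++ = 0;                         // zero-extend to wchar_t width
    }
    for (unsigned i = 0; i != 2 * wchar_tByteWidth; ++i)
      std::printf("%02x ", Buf[i]);         // prints: 61 00 00 00 62 00 00 00
    std::printf("\n");
    return 0;
  }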