[MS Demangler] Demangle string literals.
When demangling string literals, Microsoft's undname
simply prints 'string'. This patch implements string
literal demangling while doing a bit better than this
by decoding as much of the string as possible and
trying to faithfully reproduce the original string
literal definition.
This is a bit tricky because the different character
types char, char16_t, and char32_t are not uniquely
identified by the mangling, so we have to use a
heuristic to try to guess the character type. But
it works pretty well, and many tests are added to
illustrate the behavior.
Differential Revision: https://reviews.llvm.org/D50806
llvm-svn: 339892
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index 99a1f23..d5bb9a4 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -213,7 +213,7 @@
NBB_Simple = 1 << 1, // save simple names.
};
-enum class SymbolCategory { Function, Variable, Unknown };
+enum class SymbolCategory { Unknown, Function, Variable, StringLiteral };
namespace {
@@ -291,6 +291,11 @@
bool IsOperator = false;
bool IsBackReference = false;
bool IsConversionOperator = false;
+ bool IsStringLiteral = false;
+ bool IsLongStringLiteral = false;
+
+ // If IsStringLiteral is true, this is the character type.
+ PrimTy StringLiteralType = PrimTy::None;
// Name read from an MangledName string.
StringView Str;
@@ -521,6 +526,30 @@
}
}
+static void outputStringLiteral(OutputStream &OS, const Name &TheString) {
+ assert(TheString.IsStringLiteral);
+ switch (TheString.StringLiteralType) {
+ case PrimTy::Wchar:
+ OS << "const wchar_t * {L\"";
+ break;
+ case PrimTy::Char:
+ OS << "const char * {\"";
+ break;
+ case PrimTy::Char16:
+ OS << "const char16_t * {u\"";
+ break;
+ case PrimTy::Char32:
+ OS << "const char32_t * {U\"";
+ break;
+ default:
+ LLVM_BUILTIN_UNREACHABLE;
+ }
+ OS << TheString.Str << "\"";
+ if (TheString.IsLongStringLiteral)
+ OS << "...";
+ OS << "}";
+}
+
static void outputName(OutputStream &OS, const Name *TheName, const Type *Ty,
NameResolver &Resolver);
@@ -1009,6 +1038,7 @@
Name *demangleSimpleName(StringView &MangledName, bool Memorize);
Name *demangleAnonymousNamespaceName(StringView &MangledName);
Name *demangleLocallyScopedNamePiece(StringView &MangledName);
+ Name *demangleStringLiteral(StringView &MangledName);
StringView demangleSimpleString(StringView &MangledName, bool Memorize);
@@ -1017,6 +1047,8 @@
StorageClass demangleVariableStorageClass(StringView &MangledName);
ReferenceKind demangleReferenceKind(StringView &MangledName);
void demangleThrowSpecification(StringView &MangledName);
+ wchar_t demangleWcharLiteral(StringView &MangledName);
+ uint8_t demangleCharLiteral(StringView &MangledName);
std::pair<Qualifiers, bool> demangleQualifiers(StringView &MangledName);
@@ -1056,16 +1088,25 @@
S->Category = SymbolCategory::Unknown;
S->SymbolName = Arena.alloc<Name>();
S->SymbolName->Str = MangledName;
+ S->SymbolType = nullptr;
MangledName = StringView();
return S;
}
// MSVC-style mangled symbols must start with '?'.
if (!MangledName.consumeFront("?")) {
+ S->Category = SymbolCategory::Unknown;
S->SymbolName = Arena.alloc<Name>();
S->SymbolName->Str = MangledName;
- S->SymbolType = Arena.alloc<Type>();
- S->SymbolType->Prim = PrimTy::Unknown;
+ S->SymbolType = nullptr;
+ return S;
+ }
+
+ if (MangledName.consumeFront("?_C@_")) {
+ // This is a string literal. Just demangle it and return.
+ S->Category = SymbolCategory::StringLiteral;
+ S->SymbolName = demangleStringLiteral(MangledName);
+ S->SymbolType = nullptr;
return S;
}
@@ -1325,6 +1366,32 @@
return "|=";
case '6':
return "^=";
+ // case '7': # vftable
+ // case '8': # vbtable
+ // case '9': # vcall
+ // case 'A': # typeof
+ // case 'B': # local static guard
+ // case 'D': # vbase destructor
+ // case 'E': # vector deleting destructor
+ // case 'F': # default constructor closure
+ // case 'G': # scalar deleting destructor
+ // case 'H': # vector constructor iterator
+ // case 'I': # vector destructor iterator
+ // case 'J': # vector vbase constructor iterator
+ // case 'K': # virtual displacement map
+ // case 'L': # eh vector constructor iterator
+ // case 'M': # eh vector destructor iterator
+ // case 'N': # eh vector vbase constructor iterator
+ // case 'O': # copy constructor closure
+ // case 'P<name>': # udt returning <name>
+ // case 'Q': # <unknown>
+ // case 'R0': # RTTI Type Descriptor
+ // case 'R1': # RTTI Base Class Descriptor at (a,b,c,d)
+ // case 'R2': # RTTI Base Class Array
+ // case 'R3': # RTTI Class Hierarchy Descriptor
+ // case 'R4': # RTTI Complete Object Locator
+ // case 'S': # local vftable
+ // case 'T': # local vftable constructor closure
case 'U':
return " new[]";
case 'V':
@@ -1359,6 +1426,9 @@
} else {
Node->Str = NameString();
}
+ if (Error)
+ return nullptr;
+
Node->IsOperator = true;
return Node;
}
@@ -1373,6 +1443,326 @@
return Node;
}
+static bool isRebasedHexDigit(char C) { return (C >= 'A' && C <= 'P'); }
+
+static uint8_t rebasedHexDigitToNumber(char C) {
+ assert(isRebasedHexDigit(C));
+ return (C <= 'J') ? (C - 'A') : (10 + C - 'K');
+}
+
+uint8_t Demangler::demangleCharLiteral(StringView &MangledName) {
+ if (!MangledName.startsWith('?'))
+ return MangledName.popFront();
+
+ MangledName = MangledName.dropFront();
+ if (MangledName.empty())
+ goto CharLiteralError;
+
+ if (MangledName.consumeFront('$')) {
+ // Two hex digits
+ if (MangledName.size() < 2)
+ goto CharLiteralError;
+ StringView Nibbles = MangledName.substr(0, 2);
+ if (!isRebasedHexDigit(Nibbles[0]) || !isRebasedHexDigit(Nibbles[1]))
+ goto CharLiteralError;
+ // Don't append the null terminator.
+ uint8_t C1 = rebasedHexDigitToNumber(Nibbles[0]);
+ uint8_t C2 = rebasedHexDigitToNumber(Nibbles[1]);
+ MangledName = MangledName.dropFront(2);
+ return (C1 << 4) | C2;
+ }
+
+ if (startsWithDigit(MangledName)) {
+ const char *Lookup = ",/\\:. \n\t'-";
+ char C = Lookup[MangledName[0] - '0'];
+ MangledName = MangledName.dropFront();
+ return C;
+ }
+
+ if (MangledName[0] >= 'a' && MangledName[0] <= 'z') {
+ char Lookup[26] = {'\xE1', '\xE2', '\xE3', '\xE4', '\xE5', '\xE6', '\xE7',
+ '\xE8', '\xE9', '\xEA', '\xEB', '\xEC', '\xED', '\xEE',
+ '\xEF', '\xF0', '\xF1', '\xF2', '\xF3', '\xF4', '\xF5',
+ '\xF6', '\xF7', '\xF8', '\xF9', '\xFA'};
+ char C = Lookup[MangledName[0] - 'a'];
+ MangledName = MangledName.dropFront();
+ return C;
+ }
+
+ if (MangledName[0] >= 'A' && MangledName[0] <= 'Z') {
+ char Lookup[26] = {'\xC1', '\xC2', '\xC3', '\xC4', '\xC5', '\xC6', '\xC7',
+ '\xC8', '\xC9', '\xCA', '\xCB', '\xCC', '\xCD', '\xCE',
+ '\xCF', '\xD0', '\xD1', '\xD2', '\xD3', '\xD4', '\xD5',
+ '\xD6', '\xD7', '\xD8', '\xD9', '\xDA'};
+ char C = Lookup[MangledName[0] - 'A'];
+ MangledName = MangledName.dropFront();
+ return C;
+ }
+
+CharLiteralError:
+ Error = true;
+ return '\0';
+}
+
+wchar_t Demangler::demangleWcharLiteral(StringView &MangledName) {
+ uint8_t C1 = demangleCharLiteral(MangledName);
+ if (Error)
+ goto WCharLiteralError;
+ uint8_t C2 = demangleCharLiteral(MangledName);
+ if (Error)
+ goto WCharLiteralError;
+
+ return ((wchar_t)C1 << 8) | (wchar_t)C2;
+
+WCharLiteralError:
+ Error = true;
+ return L'\0';
+}
+
+static void writeHexDigit(char *Buffer, uint8_t Digit) {
+ assert(Digit <= 15);
+ *Buffer = (Digit < 10) ? ('0' + Digit) : ('A' + Digit - 10);
+}
+
+static void outputHex(OutputStream &OS, unsigned C) {
+ if (C == 0) {
+ OS << "\\x00";
+ return;
+ }
+ // It's easier to do the math if we can work from right to left, but we need
+ // to print the numbers from left to right. So render this into a temporary
+ // buffer first, then output the temporary buffer. Each byte is of the form
+ // \xAB, which means that each byte needs 4 characters. Since there are at
+ // most 4 bytes, we need a 4*4+1 = 17 character temporary buffer.
+ char TempBuffer[17];
+
+ ::memset(TempBuffer, 0, sizeof(TempBuffer));
+ constexpr int MaxPos = 15;
+
+ int Pos = MaxPos - 1;
+ while (C != 0) {
+ for (int I = 0; I < 2; ++I) {
+ writeHexDigit(&TempBuffer[Pos--], C % 16);
+ C /= 16;
+ }
+ TempBuffer[Pos--] = 'x';
+ TempBuffer[Pos--] = '\\';
+ assert(Pos >= 0);
+ }
+ OS << StringView(&TempBuffer[Pos + 1]);
+}
+
+static void outputEscapedChar(OutputStream &OS, unsigned C) {
+ switch (C) {
+ case '\'': // single quote
+ OS << "\\\'";
+ return;
+ case '\"': // double quote
+ OS << "\\\"";
+ return;
+ case '\\': // backslash
+ OS << "\\\\";
+ return;
+ case '\a': // bell
+ OS << "\\a";
+ return;
+ case '\b': // backspace
+ OS << "\\b";
+ return;
+ case '\f': // form feed
+ OS << "\\f";
+ return;
+ case '\n': // new line
+ OS << "\\n";
+ return;
+ case '\r': // carriage return
+ OS << "\\r";
+ return;
+ case '\t': // tab
+ OS << "\\t";
+ return;
+ case '\v': // vertical tab
+ OS << "\\v";
+ return;
+ default:
+ break;
+ }
+
+ if (C > 0x1F && C < 0x7F) {
+ // Standard ascii char.
+ OS << (char)C;
+ return;
+ }
+
+ outputHex(OS, C);
+}
+
+unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
+ const uint8_t *End = StringBytes + Length - 1;
+ while (Length > 0 && *End == 0) {
+ --Length;
+ --End;
+ }
+ return End - StringBytes + 1;
+}
+
+unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) {
+ unsigned Result = 0;
+ for (unsigned I = 0; I < Length; ++I) {
+ if (*StringBytes++ == 0)
+ ++Result;
+ }
+ return Result;
+}
+
+unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars,
+ unsigned NumBytes) {
+ assert(NumBytes > 0);
+
+ // If the number of bytes is odd, this is guaranteed to be a char string.
+ if (NumBytes % 2 == 1)
+ return 1;
+
+ // All strings can encode at most 32 bytes of data. If it's less than that,
+ // then we encoded the entire string. In this case we check for a 1-byte,
+ // 2-byte, or 4-byte null terminator.
+ if (NumBytes < 32) {
+ unsigned TrailingNulls = countTrailingNullBytes(StringBytes, NumChars);
+ if (TrailingNulls >= 4)
+ return 4;
+ if (TrailingNulls >= 2)
+ return 2;
+ return 1;
+ }
+
+ // The whole string was not able to be encoded. Try to look at embedded null
+ // terminators to guess. The heuristic is that we count all embedded null
+ // terminators. If more than 2/3 are null, it's a char32. If more than 1/3
+ // are null, it's a char16. Otherwise it's a char8. This obviously isn't
+ // perfect and is biased towards languages that have ascii alphabets, but this
+ // was always going to be best effort since the encoding is lossy.
+ unsigned Nulls = countEmbeddedNulls(StringBytes, NumChars);
+ if (Nulls >= 2 * NumChars / 3)
+ return 4;
+ if (Nulls >= NumChars / 3)
+ return 2;
+ return 1;
+}
+
+static unsigned decodeMultiByteChar(const uint8_t *StringBytes,
+ unsigned CharIndex, unsigned CharBytes) {
+ assert(CharBytes == 1 || CharBytes == 2 || CharBytes == 4);
+ unsigned Offset = CharIndex * CharBytes;
+ unsigned Result = 0;
+ StringBytes = StringBytes + Offset;
+ for (unsigned I = 0; I < CharBytes; ++I) {
+ unsigned C = static_cast<unsigned>(StringBytes[I]);
+ Result |= C << (8 * I);
+ }
+ return Result;
+}
+
+Name *Demangler::demangleStringLiteral(StringView &MangledName) {
+ OutputStream OS;
+ StringView CRC;
+ Name *Result = Arena.alloc<Name>();
+ Result->IsStringLiteral = true;
+
+ // Prefix indicating the beginning of a string literal
+ if (MangledName.empty())
+ goto StringLiteralError;
+
+ // Char Type (regular or wchar_t)
+ bool IsWcharT = false;
+ switch (MangledName.popFront()) {
+ case '1':
+ IsWcharT = true;
+ LLVM_FALLTHROUGH;
+ case '0':
+ break;
+ default:
+ goto StringLiteralError;
+ }
+
+ // Encoded Length
+ uint64_t StringByteSize;
+ bool IsNegative;
+ std::tie(StringByteSize, IsNegative) = demangleNumber(MangledName);
+ if (Error || IsNegative)
+ goto StringLiteralError;
+
+ // CRC 32 (always 8 characters plus a terminator)
+ size_t CrcEndPos = MangledName.find('@');
+ if (CrcEndPos == StringView::npos)
+ goto StringLiteralError;
+ CRC = MangledName.substr(0, CrcEndPos);
+ MangledName = MangledName.dropFront(CrcEndPos + 1);
+ if (MangledName.empty())
+ goto StringLiteralError;
+
+ OS = OutputStream::create(nullptr, nullptr, 1024);
+ if (IsWcharT) {
+ Result->StringLiteralType = PrimTy::Wchar;
+ if (StringByteSize > 64)
+ Result->IsLongStringLiteral = true;
+
+ while (!MangledName.consumeFront('@')) {
+ assert(StringByteSize >= 2);
+ wchar_t W = demangleWcharLiteral(MangledName);
+ if (StringByteSize != 2 || Result->IsLongStringLiteral)
+ outputEscapedChar(OS, W);
+ StringByteSize -= 2;
+ if (Error)
+ goto StringLiteralError;
+ }
+ } else {
+ if (StringByteSize > 32)
+ Result->IsLongStringLiteral = true;
+
+ constexpr unsigned MaxStringByteLength = 32;
+ uint8_t StringBytes[MaxStringByteLength];
+
+ unsigned BytesDecoded = 0;
+ while (!MangledName.consumeFront('@')) {
+ assert(StringByteSize >= 1);
+ StringBytes[BytesDecoded++] = demangleCharLiteral(MangledName);
+ }
+
+ unsigned CharBytes =
+ guessCharByteSize(StringBytes, BytesDecoded, StringByteSize);
+ assert(StringByteSize % CharBytes == 0);
+ switch (CharBytes) {
+ case 1:
+ Result->StringLiteralType = PrimTy::Char;
+ break;
+ case 2:
+ Result->StringLiteralType = PrimTy::Char16;
+ break;
+ case 4:
+ Result->StringLiteralType = PrimTy::Char32;
+ break;
+ default:
+ LLVM_BUILTIN_UNREACHABLE;
+ }
+ const unsigned NumChars = BytesDecoded / CharBytes;
+ for (unsigned CharIndex = 0; CharIndex < NumChars; ++CharIndex) {
+ unsigned NextChar =
+ decodeMultiByteChar(StringBytes, CharIndex, CharBytes);
+ if (CharIndex + 1 < NumChars || Result->IsLongStringLiteral)
+ outputEscapedChar(OS, NextChar);
+ }
+ }
+
+ OS << '\0';
+ char *ResultBuffer = OS.getBuffer();
+ Result->Str = copyString(ResultBuffer);
+ return Result;
+
+StringLiteralError:
+ Error = true;
+ return nullptr;
+}
+
StringView Demangler::demangleSimpleString(StringView &MangledName,
bool Memorize) {
StringView S;
@@ -2193,6 +2583,11 @@
outputName(OS, S->SymbolName, S->SymbolType, *this);
return;
}
+ if (S->Category == SymbolCategory::StringLiteral) {
+ outputStringLiteral(OS, *S->SymbolName);
+ return;
+ }
+
// Converts an AST to a string.
//
// Converting an AST representing a C++ type to a string is tricky due