[MC] - Don't assert when non-English characters are used.
I found that llvm-mc does not like non-English characters, even in comments,
which it tries to tokenize.
The problem happens because functions like isdigit() and isalnum() take an
int argument and expect it to be non-negative.
At the same time, MCParser uses char* to store the input buffer pointer, and char may be signed,
so it is possible to pass a negative value to one of the functions above,
which triggers an assert.
A test case demonstrating the issue is provided.
To fix the issue, helper functions were introduced in StringExtras.h.
Differential revision: https://reviews.llvm.org/D38461
llvm-svn: 314883
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index 2b96360..e9123b9 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -14,6 +14,7 @@
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -68,7 +69,7 @@
/// consumed.
AsmToken AsmLexer::LexFloatLiteral() {
// Skip the fractional digit sequence.
- while (isdigit(*CurPtr))
+ while (isDigit(*CurPtr))
++CurPtr;
// Check for exponent; we intentionally accept a slighlty wider set of
@@ -78,7 +79,7 @@
++CurPtr;
if (*CurPtr == '-' || *CurPtr == '+')
++CurPtr;
- while (isdigit(*CurPtr))
+ while (isDigit(*CurPtr))
++CurPtr;
}
@@ -102,7 +103,7 @@
++CurPtr;
const char *FracStart = CurPtr;
- while (isxdigit(*CurPtr))
+ while (isHexDigit(*CurPtr))
++CurPtr;
NoFracDigits = CurPtr == FracStart;
@@ -123,7 +124,7 @@
// N.b. exponent digits are *not* hex
const char *ExpStart = CurPtr;
- while (isdigit(*CurPtr))
+ while (isDigit(*CurPtr))
++CurPtr;
if (CurPtr == ExpStart)
@@ -135,15 +136,15 @@
/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
static bool IsIdentifierChar(char c, bool AllowAt) {
- return isalnum(c) || c == '_' || c == '$' || c == '.' ||
+ return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
(c == '@' && AllowAt) || c == '?';
}
AsmToken AsmLexer::LexIdentifier() {
// Check for floating point literals.
- if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+ if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
// Disambiguate a .1243foo identifier from a floating literal.
- while (isdigit(*CurPtr))
+ while (isDigit(*CurPtr))
++CurPtr;
if (*CurPtr == 'e' || *CurPtr == 'E' ||
!IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
@@ -244,9 +245,9 @@
const char *FirstHex = nullptr;
const char *LookAhead = CurPtr;
while (true) {
- if (isdigit(*LookAhead)) {
+ if (isDigit(*LookAhead)) {
++LookAhead;
- } else if (isxdigit(*LookAhead)) {
+ } else if (isHexDigit(*LookAhead)) {
if (!FirstHex)
FirstHex = LookAhead;
++LookAhead;
@@ -282,7 +283,7 @@
const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
CurPtr - 1 : nullptr;
const char *OldCurPtr = CurPtr;
- while (isxdigit(*CurPtr)) {
+ while (isHexDigit(*CurPtr)) {
if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
FirstNonBinary = CurPtr;
++CurPtr;
@@ -346,7 +347,7 @@
if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
++CurPtr;
// See if we actually have "0b" as part of something like "jmp 0b\n"
- if (!isdigit(CurPtr[0])) {
+ if (!isDigit(CurPtr[0])) {
--CurPtr;
StringRef Result(TokStart, CurPtr - TokStart);
return AsmToken(AsmToken::Integer, Result, 0);
@@ -375,7 +376,7 @@
if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
++CurPtr;
const char *NumStart = CurPtr;
- while (isxdigit(CurPtr[0]))
+ while (isHexDigit(CurPtr[0]))
++CurPtr;
// "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be