[MC] - Don't assert when non-English characters are used. I found that llvm-mc does not handle non-English characters well, even in comments, which it still tries to tokenize. The problem is caused by functions such as isdigit() and isalnum(), which take an int argument and expect it to be non-negative. At the same time, MCParser uses a char* to store the input buffer pointer, and char can be a signed type, so it is possible to pass a negative value to one of the functions above, which triggers an assert. A test case demonstrating the problem is provided. To fix the issue, helper functions were introduced in StringExtras.h. Differential revision: https://reviews.llvm.org/D38461 llvm-svn: 314883

commit: 099960d3222ded960033943a7b7a26340b0d7440 [log] [tgz]
author: George Rimar <grimar@accesssoftek.com> Wed Oct 04 08:50:08 2017 +0000
committer: George Rimar <grimar@accesssoftek.com> Wed Oct 04 08:50:08 2017 +0000
tree: 833f6d17efac4934c4b255a2953e3582fbc151fd
parent: a1a3f5c5e68793cb5bae5c798494e194cbe8ce58 [diff] [blame]
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index 2b96360..e9123b9 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp

@@ -14,6 +14,7 @@
 #include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -68,7 +69,7 @@
 /// consumed.
 AsmToken AsmLexer::LexFloatLiteral() {
   // Skip the fractional digit sequence.
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   // Check for exponent; we intentionally accept a slighlty wider set of
@@ -78,7 +79,7 @@
     ++CurPtr;
     if (*CurPtr == '-' || *CurPtr == '+')
       ++CurPtr;
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
   }
 
@@ -102,7 +103,7 @@
     ++CurPtr;
 
     const char *FracStart = CurPtr;
-    while (isxdigit(*CurPtr))
+    while (isHexDigit(*CurPtr))
       ++CurPtr;
 
     NoFracDigits = CurPtr == FracStart;
@@ -123,7 +124,7 @@
 
   // N.b. exponent digits are *not* hex
   const char *ExpStart = CurPtr;
-  while (isdigit(*CurPtr))
+  while (isDigit(*CurPtr))
     ++CurPtr;
 
   if (CurPtr == ExpStart)
@@ -135,15 +136,15 @@
 
 /// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c, bool AllowAt) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' ||
+  return isAlnum(c) || c == '_' || c == '$' || c == '.' ||
          (c == '@' && AllowAt) || c == '?';
 }
 
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
-  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+  if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
     // Disambiguate a .1243foo identifier from a floating literal.
-    while (isdigit(*CurPtr))
+    while (isDigit(*CurPtr))
       ++CurPtr;
     if (*CurPtr == 'e' || *CurPtr == 'E' ||
         !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
@@ -244,9 +245,9 @@
   const char *FirstHex = nullptr;
   const char *LookAhead = CurPtr;
   while (true) {
-    if (isdigit(*LookAhead)) {
+    if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isxdigit(*LookAhead)) {
+    } else if (isHexDigit(*LookAhead)) {
       if (!FirstHex)
         FirstHex = LookAhead;
       ++LookAhead;
@@ -282,7 +283,7 @@
     const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                    CurPtr - 1 : nullptr;
     const char *OldCurPtr = CurPtr;
-    while (isxdigit(*CurPtr)) {
+    while (isHexDigit(*CurPtr)) {
       if (*CurPtr != '0' && *CurPtr != '1' && !FirstNonBinary)
         FirstNonBinary = CurPtr;
       ++CurPtr;
@@ -346,7 +347,7 @@
   if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
       --CurPtr;
       StringRef Result(TokStart, CurPtr - TokStart);
       return AsmToken(AsmToken::Integer, Result, 0);
@@ -375,7 +376,7 @@
   if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
     ++CurPtr;
     const char *NumStart = CurPtr;
-    while (isxdigit(CurPtr[0]))
+    while (isHexDigit(CurPtr[0]))
       ++CurPtr;
 
     // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
commit	099960d3222ded960033943a7b7a26340b0d7440	[log] [tgz]
author	George Rimar <grimar@accesssoftek.com>	Wed Oct 04 08:50:08 2017 +0000
committer	George Rimar <grimar@accesssoftek.com>	Wed Oct 04 08:50:08 2017 +0000
tree	833f6d17efac4934c4b255a2953e3582fbc151fd
parent	a1a3f5c5e68793cb5bae5c798494e194cbe8ce58 [diff] [blame]