| //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// | 
 | // | 
 | //                     The LLVM Compiler Infrastructure | 
 | // | 
 | // This file is distributed under the University of Illinois Open Source | 
 | // License. See LICENSE.TXT for details. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 | // | 
 | // Implement the Lexer for TableGen. | 
 | // | 
 | //===----------------------------------------------------------------------===// | 
 |  | 
 | #include "TGLexer.h" | 
 | #include "llvm/Support/SourceMgr.h" | 
 | #include "llvm/Support/MemoryBuffer.h" | 
 | #include "llvm/Config/config.h" | 
 | #include <cctype> | 
 | #include <cstdio> | 
 | #include <cstdlib> | 
 | #include <cstring> | 
 | #include <cerrno> | 
 | using namespace llvm; | 
 |  | 
 | TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { | 
 |   CurBuffer = 0; | 
 |   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); | 
 |   CurPtr = CurBuf->getBufferStart(); | 
 |   TokStart = 0; | 
 | } | 
 |  | 
 | SMLoc TGLexer::getLoc() const { | 
 |   return SMLoc::getFromPointer(TokStart); | 
 | } | 
 |  | 
 |  | 
 | /// ReturnError - Set the error to the specified string at the specified | 
 | /// location.  This is defined to always return tgtok::Error. | 
 | tgtok::TokKind TGLexer::ReturnError(const char *Loc, const std::string &Msg) { | 
 |   PrintError(Loc, Msg); | 
 |   return tgtok::Error; | 
 | } | 
 |  | 
 |  | 
 | void TGLexer::PrintError(const char *Loc, const std::string &Msg) const { | 
 |   SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), Msg, "error"); | 
 | } | 
 |  | 
 | void TGLexer::PrintError(SMLoc Loc, const std::string &Msg) const { | 
 |   SrcMgr.PrintMessage(Loc, Msg, "error"); | 
 | } | 
 |  | 
 |  | 
 | int TGLexer::getNextChar() { | 
 |   char CurChar = *CurPtr++; | 
 |   switch (CurChar) { | 
 |   default: | 
 |     return (unsigned char)CurChar; | 
 |   case 0: { | 
 |     // A nul character in the stream is either the end of the current buffer or | 
 |     // a random nul in the file.  Disambiguate that here. | 
 |     if (CurPtr-1 != CurBuf->getBufferEnd()) | 
 |       return 0;  // Just whitespace. | 
 |      | 
 |     // If this is the end of an included file, pop the parent file off the | 
 |     // include stack. | 
 |     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); | 
 |     if (ParentIncludeLoc != SMLoc()) { | 
 |       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); | 
 |       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); | 
 |       CurPtr = ParentIncludeLoc.getPointer(); | 
 |       return getNextChar(); | 
 |     } | 
 |      | 
 |     // Otherwise, return end of file. | 
 |     --CurPtr;  // Another call to lex will return EOF again.   | 
 |     return EOF; | 
 |   } | 
 |   case '\n': | 
 |   case '\r': | 
 |     // Handle the newline character by ignoring it and incrementing the line | 
 |     // count.  However, be careful about 'dos style' files with \n\r in them. | 
 |     // Only treat a \n\r or \r\n as a single line. | 
 |     if ((*CurPtr == '\n' || (*CurPtr == '\r')) && | 
 |         *CurPtr != CurChar) | 
 |       ++CurPtr;  // Eat the two char newline sequence. | 
 |     return '\n'; | 
 |   }   | 
 | } | 
 |  | 
 | tgtok::TokKind TGLexer::LexToken() { | 
 |   TokStart = CurPtr; | 
 |   // This always consumes at least one character. | 
 |   int CurChar = getNextChar(); | 
 |  | 
 |   switch (CurChar) { | 
 |   default: | 
 |     // Handle letters: [a-zA-Z_] | 
 |     if (isalpha(CurChar) || CurChar == '_' || CurChar == '#') | 
 |       return LexIdentifier(); | 
 |        | 
 |     // Unknown character, emit an error. | 
 |     return ReturnError(TokStart, "Unexpected character"); | 
 |   case EOF: return tgtok::Eof; | 
 |   case ':': return tgtok::colon; | 
 |   case ';': return tgtok::semi; | 
 |   case '.': return tgtok::period; | 
 |   case ',': return tgtok::comma; | 
 |   case '<': return tgtok::less; | 
 |   case '>': return tgtok::greater; | 
 |   case ']': return tgtok::r_square; | 
 |   case '{': return tgtok::l_brace; | 
 |   case '}': return tgtok::r_brace; | 
 |   case '(': return tgtok::l_paren; | 
 |   case ')': return tgtok::r_paren; | 
 |   case '=': return tgtok::equal; | 
 |   case '?': return tgtok::question; | 
 |        | 
 |   case 0: | 
 |   case ' ': | 
 |   case '\t': | 
 |   case '\n': | 
 |   case '\r': | 
 |     // Ignore whitespace. | 
 |     return LexToken(); | 
 |   case '/': | 
 |     // If this is the start of a // comment, skip until the end of the line or | 
 |     // the end of the buffer. | 
 |     if (*CurPtr == '/') | 
 |       SkipBCPLComment(); | 
 |     else if (*CurPtr == '*') { | 
 |       if (SkipCComment()) | 
 |         return tgtok::Error; | 
 |     } else // Otherwise, this is an error. | 
 |       return ReturnError(TokStart, "Unexpected character"); | 
 |     return LexToken(); | 
 |   case '-': case '+': | 
 |   case '0': case '1': case '2': case '3': case '4': case '5': case '6': | 
 |   case '7': case '8': case '9':   | 
 |     return LexNumber(); | 
 |   case '"': return LexString(); | 
 |   case '$': return LexVarName(); | 
 |   case '[': return LexBracket(); | 
 |   case '!': return LexExclaim(); | 
 |   } | 
 | } | 
 |  | 
 | /// LexString - Lex "[^"]*" | 
 | tgtok::TokKind TGLexer::LexString() { | 
 |   const char *StrStart = CurPtr; | 
 |    | 
 |   CurStrVal = ""; | 
 |    | 
 |   while (*CurPtr != '"') { | 
 |     // If we hit the end of the buffer, report an error. | 
 |     if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) | 
 |       return ReturnError(StrStart, "End of file in string literal"); | 
 |      | 
 |     if (*CurPtr == '\n' || *CurPtr == '\r') | 
 |       return ReturnError(StrStart, "End of line in string literal"); | 
 |      | 
 |     if (*CurPtr != '\\') { | 
 |       CurStrVal += *CurPtr++; | 
 |       continue; | 
 |     } | 
 |  | 
 |     ++CurPtr; | 
 |      | 
 |     switch (*CurPtr) { | 
 |     case '\\': case '\'': case '"': | 
 |       // These turn into their literal character. | 
 |       CurStrVal += *CurPtr++; | 
 |       break; | 
 |     case 't': | 
 |       CurStrVal += '\t'; | 
 |       ++CurPtr; | 
 |       break; | 
 |     case 'n': | 
 |       CurStrVal += '\n'; | 
 |       ++CurPtr; | 
 |       break; | 
 |          | 
 |     case '\n': | 
 |     case '\r': | 
 |       return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); | 
 |  | 
 |     // If we hit the end of the buffer, report an error. | 
 |     case '\0': | 
 |       if (CurPtr == CurBuf->getBufferEnd()) | 
 |         return ReturnError(StrStart, "End of file in string literal"); | 
 |       // FALL THROUGH | 
 |     default: | 
 |       return ReturnError(CurPtr, "invalid escape in string literal"); | 
 |     } | 
 |   } | 
 |    | 
 |   ++CurPtr; | 
 |   return tgtok::StrVal; | 
 | } | 
 |  | 
 | tgtok::TokKind TGLexer::LexVarName() { | 
 |   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') | 
 |     return ReturnError(TokStart, "Invalid variable name"); | 
 |    | 
 |   // Otherwise, we're ok, consume the rest of the characters. | 
 |   const char *VarNameStart = CurPtr++; | 
 |    | 
 |   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') | 
 |     ++CurPtr; | 
 |  | 
 |   CurStrVal.assign(VarNameStart, CurPtr); | 
 |   return tgtok::VarName; | 
 | } | 
 |  | 
 |  | 
 | tgtok::TokKind TGLexer::LexIdentifier() { | 
 |   // The first letter is [a-zA-Z_]. | 
 |   const char *IdentStart = TokStart; | 
 |    | 
 |   // Match the rest of the identifier regex: [0-9a-zA-Z_]* | 
 |   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' | 
 |          || *CurPtr == '#') { | 
 |     // If this contains a '#', make sure it's value | 
 |     if (*CurPtr == '#') { | 
 |       if (strncmp(CurPtr, "#NAME#", 6) != 0) { | 
 |         return tgtok::Error; | 
 |       } | 
 |       CurPtr += 6; | 
 |     } | 
 |     else { | 
 |       ++CurPtr; | 
 |     } | 
 |   } | 
 |    | 
 |    | 
 |   // Check to see if this identifier is a keyword. | 
 |   unsigned Len = CurPtr-IdentStart; | 
 |    | 
 |   if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int; | 
 |   if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit; | 
 |   if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits; | 
 |   if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String; | 
 |   if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List; | 
 |   if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code; | 
 |   if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag; | 
 |    | 
 |   if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class; | 
 |   if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def; | 
 |   if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm; | 
 |   if (Len == 10 && !memcmp(IdentStart, "multiclass", 10)) | 
 |     return tgtok::MultiClass; | 
 |   if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field; | 
 |   if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let; | 
 |   if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In; | 
 |    | 
 |   if (Len == 7 && !memcmp(IdentStart, "include", 7)) { | 
 |     if (LexInclude()) return tgtok::Error; | 
 |     return Lex(); | 
 |   } | 
 |      | 
 |   CurStrVal.assign(IdentStart, CurPtr); | 
 |   return tgtok::Id; | 
 | } | 
 |  | 
 | /// LexInclude - We just read the "include" token.  Get the string token that | 
 | /// comes next and enter the include. | 
 | bool TGLexer::LexInclude() { | 
 |   // The token after the include must be a string. | 
 |   tgtok::TokKind Tok = LexToken(); | 
 |   if (Tok == tgtok::Error) return true; | 
 |   if (Tok != tgtok::StrVal) { | 
 |     PrintError(getLoc(), "Expected filename after include"); | 
 |     return true; | 
 |   } | 
 |  | 
 |   // Get the string. | 
 |   std::string Filename = CurStrVal; | 
 |  | 
 |    | 
 |   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr)); | 
 |   if (CurBuffer == -1) { | 
 |     PrintError(getLoc(), "Could not find include file '" + Filename + "'"); | 
 |     return true; | 
 |   } | 
 |    | 
 |   // Save the line number and lex buffer of the includer. | 
 |   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); | 
 |   CurPtr = CurBuf->getBufferStart(); | 
 |   return false; | 
 | } | 
 |  | 
 | void TGLexer::SkipBCPLComment() { | 
 |   ++CurPtr;  // skip the second slash. | 
 |   while (1) { | 
 |     switch (*CurPtr) { | 
 |     case '\n': | 
 |     case '\r': | 
 |       return;  // Newline is end of comment. | 
 |     case 0: | 
 |       // If this is the end of the buffer, end the comment. | 
 |       if (CurPtr == CurBuf->getBufferEnd()) | 
 |         return; | 
 |       break; | 
 |     } | 
 |     // Otherwise, skip the character. | 
 |     ++CurPtr; | 
 |   } | 
 | } | 
 |  | 
 | /// SkipCComment - This skips C-style /**/ comments.  The only difference from C | 
 | /// is that we allow nesting. | 
 | bool TGLexer::SkipCComment() { | 
 |   ++CurPtr;  // skip the star. | 
 |   unsigned CommentDepth = 1; | 
 |    | 
 |   while (1) { | 
 |     int CurChar = getNextChar(); | 
 |     switch (CurChar) { | 
 |     case EOF: | 
 |       PrintError(TokStart, "Unterminated comment!"); | 
 |       return true; | 
 |     case '*': | 
 |       // End of the comment? | 
 |       if (CurPtr[0] != '/') break; | 
 |        | 
 |       ++CurPtr;   // End the */. | 
 |       if (--CommentDepth == 0) | 
 |         return false; | 
 |       break; | 
 |     case '/': | 
 |       // Start of a nested comment? | 
 |       if (CurPtr[0] != '*') break; | 
 |       ++CurPtr; | 
 |       ++CommentDepth; | 
 |       break; | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | /// LexNumber - Lex: | 
 | ///    [-+]?[0-9]+ | 
 | ///    0x[0-9a-fA-F]+ | 
 | ///    0b[01]+ | 
 | tgtok::TokKind TGLexer::LexNumber() { | 
 |   if (CurPtr[-1] == '0') { | 
 |     if (CurPtr[0] == 'x') { | 
 |       ++CurPtr; | 
 |       const char *NumStart = CurPtr; | 
 |       while (isxdigit(CurPtr[0])) | 
 |         ++CurPtr; | 
 |        | 
 |       // Requires at least one hex digit. | 
 |       if (CurPtr == NumStart) | 
 |         return ReturnError(TokStart, "Invalid hexadecimal number"); | 
 |  | 
 |       errno = 0; | 
 |       CurIntVal = strtoll(NumStart, 0, 16); | 
 |       if (errno == EINVAL) | 
 |         return ReturnError(TokStart, "Invalid hexadecimal number"); | 
 |       if (errno == ERANGE) { | 
 |         errno = 0; | 
 |         CurIntVal = (int64_t)strtoull(NumStart, 0, 16); | 
 |         if (errno == EINVAL) | 
 |           return ReturnError(TokStart, "Invalid hexadecimal number"); | 
 |         if (errno == ERANGE) | 
 |           return ReturnError(TokStart, "Hexadecimal number out of range"); | 
 |       } | 
 |       return tgtok::IntVal; | 
 |     } else if (CurPtr[0] == 'b') { | 
 |       ++CurPtr; | 
 |       const char *NumStart = CurPtr; | 
 |       while (CurPtr[0] == '0' || CurPtr[0] == '1') | 
 |         ++CurPtr; | 
 |  | 
 |       // Requires at least one binary digit. | 
 |       if (CurPtr == NumStart) | 
 |         return ReturnError(CurPtr-2, "Invalid binary number"); | 
 |       CurIntVal = strtoll(NumStart, 0, 2); | 
 |       return tgtok::IntVal; | 
 |     } | 
 |   } | 
 |  | 
 |   // Check for a sign without a digit. | 
 |   if (!isdigit(CurPtr[0])) { | 
 |     if (CurPtr[-1] == '-') | 
 |       return tgtok::minus; | 
 |     else if (CurPtr[-1] == '+') | 
 |       return tgtok::plus; | 
 |   } | 
 |    | 
 |   while (isdigit(CurPtr[0])) | 
 |     ++CurPtr; | 
 |   CurIntVal = strtoll(TokStart, 0, 10); | 
 |   return tgtok::IntVal; | 
 | } | 
 |  | 
 | /// LexBracket - We just read '['.  If this is a code block, return it, | 
 | /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' | 
 | tgtok::TokKind TGLexer::LexBracket() { | 
 |   if (CurPtr[0] != '{') | 
 |     return tgtok::l_square; | 
 |   ++CurPtr; | 
 |   const char *CodeStart = CurPtr; | 
 |   while (1) { | 
 |     int Char = getNextChar(); | 
 |     if (Char == EOF) break; | 
 |      | 
 |     if (Char != '}') continue; | 
 |      | 
 |     Char = getNextChar(); | 
 |     if (Char == EOF) break; | 
 |     if (Char == ']') { | 
 |       CurStrVal.assign(CodeStart, CurPtr-2); | 
 |       return tgtok::CodeFragment; | 
 |     } | 
 |   } | 
 |    | 
 |   return ReturnError(CodeStart-2, "Unterminated Code Block"); | 
 | } | 
 |  | 
 | /// LexExclaim - Lex '!' and '![a-zA-Z]+'. | 
 | tgtok::TokKind TGLexer::LexExclaim() { | 
 |   if (!isalpha(*CurPtr)) | 
 |     return ReturnError(CurPtr-1, "Invalid \"!operator\""); | 
 |    | 
 |   const char *Start = CurPtr++; | 
 |   while (isalpha(*CurPtr)) | 
 |     ++CurPtr; | 
 |    | 
 |   // Check to see which operator this is. | 
 |   unsigned Len = CurPtr-Start; | 
 |    | 
 |   if (Len == 3  && !memcmp(Start, "con", 3)) return tgtok::XConcat; | 
 |   if (Len == 3  && !memcmp(Start, "sra", 3)) return tgtok::XSRA; | 
 |   if (Len == 3  && !memcmp(Start, "srl", 3)) return tgtok::XSRL; | 
 |   if (Len == 3  && !memcmp(Start, "shl", 3)) return tgtok::XSHL; | 
 |   if (Len == 9  && !memcmp(Start, "strconcat", 9))   return tgtok::XStrConcat; | 
 |   if (Len == 10 && !memcmp(Start, "nameconcat", 10)) return tgtok::XNameConcat; | 
 |   if (Len == 5 && !memcmp(Start, "subst", 5)) return tgtok::XSubst; | 
 |   if (Len == 7 && !memcmp(Start, "foreach", 7)) return tgtok::XForEach; | 
 |   if (Len == 4 && !memcmp(Start, "cast", 4)) return tgtok::XCast; | 
 |   if (Len == 3 && !memcmp(Start, "car", 3)) return tgtok::XCar; | 
 |   if (Len == 3 && !memcmp(Start, "cdr", 3)) return tgtok::XCdr; | 
 |   if (Len == 4 && !memcmp(Start, "null", 4)) return tgtok::XNull; | 
 |   if (Len == 2 && !memcmp(Start, "if", 2)) return tgtok::XIf; | 
 |  | 
 |   return ReturnError(Start-1, "Unknown operator"); | 
 | } | 
 |  |