blob: 7f53886e889896af4e44b91f8520b81166751916 [file] [log] [blame]
//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
Jacques Pienaar9c411be2018-06-24 19:17:35 -070028Lexer::Lexer(llvm::SourceMgr &sourceMgr,
29 const SMDiagnosticHandlerTy &errorReporter)
30 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070031 auto bufferID = sourceMgr.getMainFileID();
32 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
33 curPtr = curBuffer.begin();
34}
35
36/// emitError - Emit an error message and return an Token::error token.
37Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070038 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
39 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070040 return formToken(Token::error, loc);
41}
42
43Token Lexer::lexToken() {
44 const char *tokStart = curPtr;
45
46 switch (*curPtr++) {
47 default:
48 // Handle bare identifiers.
49 if (isalpha(curPtr[-1]))
50 return lexBareIdentifierOrKeyword(tokStart);
51
52 // Unknown character, emit an error.
53 return emitError(tokStart, "unexpected character");
54
55 case 0:
56 // This may either be a nul character in the source file or may be the EOF
57 // marker that llvm::MemoryBuffer guarantees will be there.
58 if (curPtr-1 == curBuffer.end())
59 return formToken(Token::eof, tokStart);
60
61 LLVM_FALLTHROUGH;
62 case ' ':
63 case '\t':
64 case '\n':
65 case '\r':
66 // Ignore whitespace.
67 return lexToken();
68
Chris Lattner4c95a502018-06-23 16:03:42 -070069 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070070 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070071 case '(': return formToken(Token::l_paren, tokStart);
72 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070073 case '{': return formToken(Token::l_brace, tokStart);
74 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070075 case '<': return formToken(Token::less, tokStart);
76 case '>': return formToken(Token::greater, tokStart);
77
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070078 case '-':
79 if (*curPtr == '>') {
80 ++curPtr;
81 return formToken(Token::arrow, tokStart);
82 }
83 return emitError(tokStart, "unexpected character");
84
85 case '?':
86 if (*curPtr == '?') {
87 ++curPtr;
88 return formToken(Token::questionquestion, tokStart);
89 }
90
91 return formToken(Token::question, tokStart);
92
Chris Lattnere79379a2018-06-22 10:39:19 -070093 case ';': return lexComment();
94 case '@': return lexAtIdentifier(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070095
96 case '0': case '1': case '2': case '3': case '4':
97 case '5': case '6': case '7': case '8': case '9':
98 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070099 }
100}
101
102/// Lex a comment line, starting with a semicolon.
103///
104/// TODO: add a regex for comments here and to the spec.
105///
106Token Lexer::lexComment() {
107 while (true) {
108 switch (*curPtr++) {
109 case '\n':
110 case '\r':
111 // Newline is end of comment.
112 return lexToken();
113 case 0:
114 // If this is the end of the buffer, end the comment.
115 if (curPtr-1 == curBuffer.end()) {
116 --curPtr;
117 return lexToken();
118 }
119 LLVM_FALLTHROUGH;
120 default:
121 // Skip over other characters.
122 break;
123 }
124 }
125}
126
127/// Lex a bare identifier or keyword that starts with a letter.
128///
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700129/// bare-id ::= letter (letter|digit|[_])*
Chris Lattnere79379a2018-06-22 10:39:19 -0700130///
131Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700132 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
133 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700134 ++curPtr;
135
136 // Check to see if this identifier is a keyword.
137 StringRef spelling(tokStart, curPtr-tokStart);
138
139 Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700140 .Case("bf16", Token::kw_bf16)
141 .Case("br", Token::kw_br)
Chris Lattnere79379a2018-06-22 10:39:19 -0700142 .Case("cfgfunc", Token::kw_cfgfunc)
143 .Case("extfunc", Token::kw_extfunc)
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700144 .Case("f16", Token::kw_f16)
145 .Case("f32", Token::kw_f32)
146 .Case("f64", Token::kw_f64)
147 .Case("i1", Token::kw_i1)
148 .Case("i16", Token::kw_i16)
149 .Case("i32", Token::kw_i32)
150 .Case("i64", Token::kw_i64)
151 .Case("i8", Token::kw_i8)
152 .Case("int", Token::kw_int)
153 .Case("memref", Token::kw_memref)
Chris Lattnere79379a2018-06-22 10:39:19 -0700154 .Case("mlfunc", Token::kw_mlfunc)
Chris Lattner4c95a502018-06-23 16:03:42 -0700155 .Case("return", Token::kw_return)
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700156 .Case("tensor", Token::kw_tensor)
157 .Case("vector", Token::kw_vector)
Chris Lattnere79379a2018-06-22 10:39:19 -0700158 .Default(Token::bare_identifier);
159
160 return Token(kind, spelling);
161}
162
163/// Lex an '@foo' identifier.
164///
165/// function-id ::= `@` bare-id
166///
167Token Lexer::lexAtIdentifier(const char *tokStart) {
168 // These always start with a letter.
169 if (!isalpha(*curPtr++))
170 return emitError(curPtr-1, "expected letter in @ identifier");
171
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700172 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700173 ++curPtr;
174 return formToken(Token::at_identifier, tokStart);
175}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700176
177/// Lex an integer literal.
178///
179/// integer-literal ::= digit+ | `0x` hex_digit+
180///
181Token Lexer::lexNumber(const char *tokStart) {
182 assert(isdigit(curPtr[-1]));
183
184 // Handle the hexadecimal case.
185 if (curPtr[-1] == '0' && *curPtr == 'x') {
186 ++curPtr;
187
188 if (!isxdigit(*curPtr))
189 return emitError(curPtr, "expected hexadecimal digit");
190
191 while (isxdigit(*curPtr))
192 ++curPtr;
193
194 return formToken(Token::integer, tokStart);
195 }
196
197 // Handle the normal decimal case.
198 while (isdigit(*curPtr))
199 ++curPtr;
200
201 return formToken(Token::integer, tokStart);
202}