blob: b6473f523ebd6e066b6b9ae46474da01d28d870a [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
/// Returns true if 'c' is one of the punctuation characters allowed by the
/// lexer, i.e. one of [$._-]. Returns false for every other character.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
33
Jacques Pienaar9c411be2018-06-24 19:17:35 -070034Lexer::Lexer(llvm::SourceMgr &sourceMgr,
35 const SMDiagnosticHandlerTy &errorReporter)
36 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070037 auto bufferID = sourceMgr.getMainFileID();
38 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
39 curPtr = curBuffer.begin();
40}
41
42/// emitError - Emit an error message and return an Token::error token.
43Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070044 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
45 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070046 return formToken(Token::error, loc);
47}
48
49Token Lexer::lexToken() {
50 const char *tokStart = curPtr;
51
52 switch (*curPtr++) {
53 default:
54 // Handle bare identifiers.
55 if (isalpha(curPtr[-1]))
56 return lexBareIdentifierOrKeyword(tokStart);
57
58 // Unknown character, emit an error.
59 return emitError(tokStart, "unexpected character");
60
61 case 0:
62 // This may either be a nul character in the source file or may be the EOF
63 // marker that llvm::MemoryBuffer guarantees will be there.
64 if (curPtr-1 == curBuffer.end())
65 return formToken(Token::eof, tokStart);
66
67 LLVM_FALLTHROUGH;
68 case ' ':
69 case '\t':
70 case '\n':
71 case '\r':
72 // Ignore whitespace.
73 return lexToken();
74
Chris Lattner4c95a502018-06-23 16:03:42 -070075 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070076 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070077 case '(': return formToken(Token::l_paren, tokStart);
78 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070079 case '{': return formToken(Token::l_brace, tokStart);
80 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070081 case '<': return formToken(Token::less, tokStart);
82 case '>': return formToken(Token::greater, tokStart);
83
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070084 case '-':
85 if (*curPtr == '>') {
86 ++curPtr;
87 return formToken(Token::arrow, tokStart);
88 }
89 return emitError(tokStart, "unexpected character");
90
91 case '?':
92 if (*curPtr == '?') {
93 ++curPtr;
94 return formToken(Token::questionquestion, tokStart);
95 }
96
97 return formToken(Token::question, tokStart);
98
Chris Lattnere79379a2018-06-22 10:39:19 -070099 case ';': return lexComment();
100 case '@': return lexAtIdentifier(tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700101 case '#': return lexAffineMapId(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700102 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700103
104 case '0': case '1': case '2': case '3': case '4':
105 case '5': case '6': case '7': case '8': case '9':
106 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700107 }
108}
109
110/// Lex a comment line, starting with a semicolon.
111///
112/// TODO: add a regex for comments here and to the spec.
113///
114Token Lexer::lexComment() {
115 while (true) {
116 switch (*curPtr++) {
117 case '\n':
118 case '\r':
119 // Newline is end of comment.
120 return lexToken();
121 case 0:
122 // If this is the end of the buffer, end the comment.
123 if (curPtr-1 == curBuffer.end()) {
124 --curPtr;
125 return lexToken();
126 }
127 LLVM_FALLTHROUGH;
128 default:
129 // Skip over other characters.
130 break;
131 }
132 }
133}
134
135/// Lex a bare identifier or keyword that starts with a letter.
136///
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700137/// bare-id ::= letter (letter|digit|[_])*
Chris Lattnere79379a2018-06-22 10:39:19 -0700138///
139Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700140 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
141 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700142 ++curPtr;
143
144 // Check to see if this identifier is a keyword.
145 StringRef spelling(tokStart, curPtr-tokStart);
146
147 Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700148 .Case("bf16", Token::kw_bf16)
149 .Case("br", Token::kw_br)
Chris Lattnere79379a2018-06-22 10:39:19 -0700150 .Case("cfgfunc", Token::kw_cfgfunc)
151 .Case("extfunc", Token::kw_extfunc)
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700152 .Case("f16", Token::kw_f16)
153 .Case("f32", Token::kw_f32)
154 .Case("f64", Token::kw_f64)
155 .Case("i1", Token::kw_i1)
156 .Case("i16", Token::kw_i16)
157 .Case("i32", Token::kw_i32)
158 .Case("i64", Token::kw_i64)
159 .Case("i8", Token::kw_i8)
160 .Case("int", Token::kw_int)
161 .Case("memref", Token::kw_memref)
Chris Lattnere79379a2018-06-22 10:39:19 -0700162 .Case("mlfunc", Token::kw_mlfunc)
Chris Lattner4c95a502018-06-23 16:03:42 -0700163 .Case("return", Token::kw_return)
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700164 .Case("tensor", Token::kw_tensor)
165 .Case("vector", Token::kw_vector)
Chris Lattnere79379a2018-06-22 10:39:19 -0700166 .Default(Token::bare_identifier);
167
168 return Token(kind, spelling);
169}
170
171/// Lex an '@foo' identifier.
172///
173/// function-id ::= `@` bare-id
174///
175Token Lexer::lexAtIdentifier(const char *tokStart) {
176 // These always start with a letter.
177 if (!isalpha(*curPtr++))
178 return emitError(curPtr-1, "expected letter in @ identifier");
179
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700180 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700181 ++curPtr;
182 return formToken(Token::at_identifier, tokStart);
183}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700184
MLIR Teamf85a6262018-06-27 11:03:08 -0700185/// Lex an '#foo' identifier.
186///
187/// affine-map-id ::= `#` suffix-id
188/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
189///
190// TODO(andydavis) Consider moving suffix-id parsing to a shared function
191// so it can be re-used to parse %suffix-id.
192Token Lexer::lexAffineMapId(const char *tokStart) {
193 // Parse suffix-id.
194 if (isdigit(*curPtr)) {
195 // If suffix-id starts with a digit, the rest must be digits.
196 while (isdigit(*curPtr)) {
197 ++curPtr;
198 }
199 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
200 do {
201 ++curPtr;
202 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
203 } else {
204 return emitError(curPtr-1, "invalid affine map id");
205 }
206 return formToken(Token::affine_map_id, tokStart);
207}
208
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700209/// Lex an integer literal.
210///
211/// integer-literal ::= digit+ | `0x` hex_digit+
212///
213Token Lexer::lexNumber(const char *tokStart) {
214 assert(isdigit(curPtr[-1]));
215
216 // Handle the hexadecimal case.
217 if (curPtr[-1] == '0' && *curPtr == 'x') {
218 ++curPtr;
219
220 if (!isxdigit(*curPtr))
221 return emitError(curPtr, "expected hexadecimal digit");
222
223 while (isxdigit(*curPtr))
224 ++curPtr;
225
226 return formToken(Token::integer, tokStart);
227 }
228
229 // Handle the normal decimal case.
230 while (isdigit(*curPtr))
231 ++curPtr;
232
233 return formToken(Token::integer, tokStart);
234}
Chris Lattnered65a732018-06-28 20:45:33 -0700235
236/// Lex a string literal.
237///
238/// string-literal ::= '"' [^"\n\f\v\r]* '"'
239///
240/// TODO: define escaping rules.
241Token Lexer::lexString(const char *tokStart) {
242 assert(curPtr[-1] == '"');
243
244 while (1) {
245 switch (*curPtr++) {
246 case '"':
247 return formToken(Token::string, tokStart);
248 case '0':
249 // If this is a random nul character in the middle of a string, just
250 // include it. If it is the end of file, then it is an error.
251 if (curPtr-1 != curBuffer.end())
252 continue;
253 LLVM_FALLTHROUGH;
254 case '\n':
255 case '\v':
256 case '\f':
257 return emitError(curPtr-1, "expected '\"' in string literal");
258
259 default:
260 continue;
261 }
262 }
263}