blob: 043acd77100fc4bba0e8c1cd7b1c6feff75f8abb [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
Chris Lattner0497c4b2018-08-15 09:09:54 -070024#include <cctype>
Chris Lattnere79379a2018-06-22 10:39:19 -070025using namespace mlir;
26using llvm::SMLoc;
27using llvm::SourceMgr;
28
// Returns true if 'c' is one of the punctuation characters allowed inside a
// suffix-id: '$', '.', '_' or '-'.  Returns false for any other character.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
34
Jacques Pienaar0bffd862018-07-11 13:26:23 -070035Lexer::Lexer(llvm::SourceMgr &sourceMgr, SMDiagnosticHandlerTy errorReporter)
Jacques Pienaar9c411be2018-06-24 19:17:35 -070036 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070037 auto bufferID = sourceMgr.getMainFileID();
38 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
39 curPtr = curBuffer.begin();
40}
41
42/// emitError - Emit an error message and return an Token::error token.
43Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070044 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
45 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070046 return formToken(Token::error, loc);
47}
48
49Token Lexer::lexToken() {
50 const char *tokStart = curPtr;
51
52 switch (*curPtr++) {
53 default:
54 // Handle bare identifiers.
55 if (isalpha(curPtr[-1]))
56 return lexBareIdentifierOrKeyword(tokStart);
57
58 // Unknown character, emit an error.
59 return emitError(tokStart, "unexpected character");
60
Chris Lattneree0c2ae2018-07-29 12:37:35 -070061 case '_':
62 // Handle bare identifiers.
63 return lexBareIdentifierOrKeyword(tokStart);
64
Chris Lattnere79379a2018-06-22 10:39:19 -070065 case 0:
66 // This may either be a nul character in the source file or may be the EOF
67 // marker that llvm::MemoryBuffer guarantees will be there.
68 if (curPtr-1 == curBuffer.end())
69 return formToken(Token::eof, tokStart);
70
71 LLVM_FALLTHROUGH;
72 case ' ':
73 case '\t':
74 case '\n':
75 case '\r':
76 // Ignore whitespace.
77 return lexToken();
78
Chris Lattner4c95a502018-06-23 16:03:42 -070079 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070080 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070081 case '(': return formToken(Token::l_paren, tokStart);
82 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070083 case '{': return formToken(Token::l_brace, tokStart);
84 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner85ee1512018-07-25 11:15:20 -070085 case '[':
86 return formToken(Token::l_square, tokStart);
87 case ']':
88 return formToken(Token::r_square, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070089 case '<': return formToken(Token::less, tokStart);
90 case '>': return formToken(Token::greater, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070091 case '=': return formToken(Token::equal, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070092
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070093 case '+': return formToken(Token::plus, tokStart);
94 case '*': return formToken(Token::star, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070095 case '-':
96 if (*curPtr == '>') {
97 ++curPtr;
98 return formToken(Token::arrow, tokStart);
99 }
Uday Bondhugula015cbb12018-07-03 20:16:08 -0700100 return formToken(Token::minus, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700101
102 case '?':
103 if (*curPtr == '?') {
104 ++curPtr;
105 return formToken(Token::questionquestion, tokStart);
106 }
107
108 return formToken(Token::question, tokStart);
109
Chris Lattner3e59f082018-07-14 23:06:24 -0700110 case '/':
111 if (*curPtr == '/')
112 return lexComment();
113 return emitError(tokStart, "unexpected character");
114
Uday Bondhugulabc535622018-08-07 14:24:38 -0700115 case '@':
116 if (*curPtr == '@') {
117 ++curPtr;
118 return lexDoubleAtIdentifier(tokStart);
119 }
120 return lexAtIdentifier(tokStart);
121
Chris Lattner78276e32018-07-07 15:48:26 -0700122 case '#':
123 LLVM_FALLTHROUGH;
124 case '%':
125 return lexPrefixedIdentifier(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700126 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700127
128 case '0': case '1': case '2': case '3': case '4':
129 case '5': case '6': case '7': case '8': case '9':
130 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700131 }
132}
133
134/// Lex a comment line, starting with a semicolon.
135///
136/// TODO: add a regex for comments here and to the spec.
137///
138Token Lexer::lexComment() {
Chris Lattner3e59f082018-07-14 23:06:24 -0700139 // Advance over the second '/' in a '//' comment.
140 assert(*curPtr == '/');
141 ++curPtr;
142
Chris Lattnere79379a2018-06-22 10:39:19 -0700143 while (true) {
144 switch (*curPtr++) {
145 case '\n':
146 case '\r':
147 // Newline is end of comment.
148 return lexToken();
149 case 0:
150 // If this is the end of the buffer, end the comment.
151 if (curPtr-1 == curBuffer.end()) {
152 --curPtr;
153 return lexToken();
154 }
155 LLVM_FALLTHROUGH;
156 default:
157 // Skip over other characters.
158 break;
159 }
160 }
161}
162
163/// Lex a bare identifier or keyword that starts with a letter.
164///
Jacques Pienaar4451c572018-07-31 15:40:09 -0700165/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700166/// integer-type ::= `i[1-9][0-9]*`
Chris Lattnere79379a2018-06-22 10:39:19 -0700167///
168Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Jacques Pienaar4451c572018-07-31 15:40:09 -0700169 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
Jacques Pienaarc0d69302018-07-27 11:07:12 -0700170 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
Jacques Pienaar4451c572018-07-31 15:40:09 -0700171 *curPtr == '$' || *curPtr == '.')
Chris Lattnere79379a2018-06-22 10:39:19 -0700172 ++curPtr;
173
174 // Check to see if this identifier is a keyword.
175 StringRef spelling(tokStart, curPtr-tokStart);
176
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700177 // Check for i123.
178 if (tokStart[0] == 'i') {
179 bool allDigits = true;
180 for (auto c : spelling.drop_front())
181 allDigits &= isdigit(c) != 0;
182 if (allDigits && spelling.size() != 1)
183 return Token(Token::inttype, spelling);
184 }
185
Chris Lattner8da0c282018-06-29 11:15:56 -0700186 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
187#define TOK_KEYWORD(SPELLING) \
188 .Case(#SPELLING, Token::kw_##SPELLING)
189#include "TokenKinds.def"
Chris Lattnere79379a2018-06-22 10:39:19 -0700190 .Default(Token::bare_identifier);
191
192 return Token(kind, spelling);
193}
194
195/// Lex an '@foo' identifier.
196///
197/// function-id ::= `@` bare-id
198///
199Token Lexer::lexAtIdentifier(const char *tokStart) {
200 // These always start with a letter.
201 if (!isalpha(*curPtr++))
202 return emitError(curPtr-1, "expected letter in @ identifier");
203
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700204 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700205 ++curPtr;
206 return formToken(Token::at_identifier, tokStart);
207}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700208
Uday Bondhugulabc535622018-08-07 14:24:38 -0700209/// Lex an '@@foo' identifier.
210///
211/// function-id ::= `@@` bare-id
212///
213Token Lexer::lexDoubleAtIdentifier(const char *tokStart) {
214 // These always start with a letter.
215 if (!isalpha(*curPtr++))
216 return emitError(curPtr - 1, "expected letter in @@ identifier");
217
218 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
219 ++curPtr;
220 return formToken(Token::double_at_identifier, tokStart);
221}
222
Chris Lattner78276e32018-07-07 15:48:26 -0700223/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Teamf85a6262018-06-27 11:03:08 -0700224///
225/// affine-map-id ::= `#` suffix-id
Chris Lattner78276e32018-07-07 15:48:26 -0700226/// ssa-id ::= '%' suffix-id
MLIR Teamf85a6262018-06-27 11:03:08 -0700227/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
228///
Chris Lattner78276e32018-07-07 15:48:26 -0700229Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
230 Token::Kind kind;
231 StringRef errorKind;
232 switch (*tokStart) {
233 case '#':
234 kind = Token::hash_identifier;
235 errorKind = "invalid affine map name";
236 break;
237 case '%':
238 kind = Token::percent_identifier;
239 errorKind = "invalid SSA name";
240 break;
241 default:
242 llvm_unreachable("invalid caller");
243 }
244
MLIR Teamf85a6262018-06-27 11:03:08 -0700245 // Parse suffix-id.
246 if (isdigit(*curPtr)) {
247 // If suffix-id starts with a digit, the rest must be digits.
248 while (isdigit(*curPtr)) {
249 ++curPtr;
250 }
251 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
252 do {
253 ++curPtr;
254 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
255 } else {
Chris Lattner78276e32018-07-07 15:48:26 -0700256 return emitError(curPtr - 1, errorKind);
MLIR Teamf85a6262018-06-27 11:03:08 -0700257 }
Chris Lattner78276e32018-07-07 15:48:26 -0700258
259 return formToken(kind, tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700260}
261
Jacques Pienaar84491092018-07-31 17:15:15 -0700262/// Lex a number literal.
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700263///
264/// integer-literal ::= digit+ | `0x` hex_digit+
Jacques Pienaar84491092018-07-31 17:15:15 -0700265/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700266///
267Token Lexer::lexNumber(const char *tokStart) {
268 assert(isdigit(curPtr[-1]));
269
270 // Handle the hexadecimal case.
271 if (curPtr[-1] == '0' && *curPtr == 'x') {
272 ++curPtr;
273
274 if (!isxdigit(*curPtr))
275 return emitError(curPtr, "expected hexadecimal digit");
276
277 while (isxdigit(*curPtr))
278 ++curPtr;
279
280 return formToken(Token::integer, tokStart);
281 }
282
283 // Handle the normal decimal case.
284 while (isdigit(*curPtr))
285 ++curPtr;
286
Jacques Pienaar84491092018-07-31 17:15:15 -0700287 if (*curPtr != '.')
288 return formToken(Token::integer, tokStart);
289 ++curPtr;
290
291 // Skip over [0-9]*([eE][-+]?[0-9]+)?
292 while (isdigit(*curPtr)) ++curPtr;
293
294 if (*curPtr == 'e' || *curPtr == 'E') {
295 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
296 ((curPtr[1] == '-' || curPtr[1] == '+') &&
297 isdigit(static_cast<unsigned char>(curPtr[2])))) {
298 curPtr += 2;
299 while (isdigit(*curPtr)) ++curPtr;
300 }
301 }
302 return formToken(Token::floatliteral, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700303}
Chris Lattnered65a732018-06-28 20:45:33 -0700304
305/// Lex a string literal.
306///
307/// string-literal ::= '"' [^"\n\f\v\r]* '"'
308///
309/// TODO: define escaping rules.
310Token Lexer::lexString(const char *tokStart) {
311 assert(curPtr[-1] == '"');
312
313 while (1) {
314 switch (*curPtr++) {
315 case '"':
316 return formToken(Token::string, tokStart);
317 case '0':
318 // If this is a random nul character in the middle of a string, just
319 // include it. If it is the end of file, then it is an error.
320 if (curPtr-1 != curBuffer.end())
321 continue;
322 LLVM_FALLTHROUGH;
323 case '\n':
324 case '\v':
325 case '\f':
326 return emitError(curPtr-1, "expected '\"' in string literal");
James Molloy3cdb8aa2018-08-14 01:16:45 -0700327 case '\\':
Chris Lattner0497c4b2018-08-15 09:09:54 -0700328 // Handle explicitly a few escapes.
329 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
James Molloy3cdb8aa2018-08-14 01:16:45 -0700330 ++curPtr;
Chris Lattner0497c4b2018-08-15 09:09:54 -0700331 else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
332 // Support \xx for two hex digits.
333 curPtr += 2;
334 else
335 return emitError(curPtr - 1, "unknown escape in string literal");
James Molloy3cdb8aa2018-08-14 01:16:45 -0700336 continue;
Chris Lattnered65a732018-06-28 20:45:33 -0700337
338 default:
339 continue;
340 }
341 }
342}