blob: b4f8e1db6b530ee523cf4027b2393e5fa80a9f9e [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
Chris Lattner7879f842018-09-02 22:01:45 -070023#include "mlir/IR/Location.h"
24#include "mlir/IR/MLIRContext.h"
Chris Lattnere79379a2018-06-22 10:39:19 -070025#include "llvm/Support/SourceMgr.h"
26using namespace mlir;
27using llvm::SMLoc;
28using llvm::SourceMgr;
29
// Returns true if 'c' is one of the punctuation characters allowed inside an
// identifier suffix, i.e. one of [$._-].  Returns false for anything else.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
35
Chris Lattner7879f842018-09-02 22:01:45 -070036Lexer::Lexer(llvm::SourceMgr &sourceMgr, MLIRContext *context)
37 : sourceMgr(sourceMgr), context(context) {
Chris Lattnere79379a2018-06-22 10:39:19 -070038 auto bufferID = sourceMgr.getMainFileID();
39 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
40 curPtr = curBuffer.begin();
41}
42
Chris Lattner7879f842018-09-02 22:01:45 -070043/// Encode the specified source location information into an attribute for
44/// attachment to the IR.
45Location *Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
46 auto &sourceMgr = getSourceMgr();
47 unsigned mainFileID = sourceMgr.getMainFileID();
48 auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
49 auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
50 auto filename = UniquedFilename::get(buffer->getBufferIdentifier(), context);
51
52 return FileLineColLoc::get(filename, lineAndColumn.first,
53 lineAndColumn.second, context);
54}
55
Chris Lattnere79379a2018-06-22 10:39:19 -070056/// emitError - Emit an error message and return an Token::error token.
57Token Lexer::emitError(const char *loc, const Twine &message) {
Chris Lattner7879f842018-09-02 22:01:45 -070058 context->emitDiagnostic(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
59 message, MLIRContext::DiagnosticKind::Error);
Chris Lattnere79379a2018-06-22 10:39:19 -070060 return formToken(Token::error, loc);
61}
62
63Token Lexer::lexToken() {
64 const char *tokStart = curPtr;
65
66 switch (*curPtr++) {
67 default:
68 // Handle bare identifiers.
69 if (isalpha(curPtr[-1]))
70 return lexBareIdentifierOrKeyword(tokStart);
71
72 // Unknown character, emit an error.
73 return emitError(tokStart, "unexpected character");
74
Chris Lattneree0c2ae2018-07-29 12:37:35 -070075 case '_':
76 // Handle bare identifiers.
77 return lexBareIdentifierOrKeyword(tokStart);
78
Chris Lattnere79379a2018-06-22 10:39:19 -070079 case 0:
80 // This may either be a nul character in the source file or may be the EOF
81 // marker that llvm::MemoryBuffer guarantees will be there.
82 if (curPtr-1 == curBuffer.end())
83 return formToken(Token::eof, tokStart);
84
85 LLVM_FALLTHROUGH;
86 case ' ':
87 case '\t':
88 case '\n':
89 case '\r':
90 // Ignore whitespace.
91 return lexToken();
92
Chris Lattner4c95a502018-06-23 16:03:42 -070093 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070094 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070095 case '(': return formToken(Token::l_paren, tokStart);
96 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070097 case '{': return formToken(Token::l_brace, tokStart);
98 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner85ee1512018-07-25 11:15:20 -070099 case '[':
100 return formToken(Token::l_square, tokStart);
101 case ']':
102 return formToken(Token::r_square, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700103 case '<': return formToken(Token::less, tokStart);
104 case '>': return formToken(Token::greater, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -0700105 case '=': return formToken(Token::equal, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700106
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -0700107 case '+': return formToken(Token::plus, tokStart);
108 case '*': return formToken(Token::star, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700109 case '-':
110 if (*curPtr == '>') {
111 ++curPtr;
112 return formToken(Token::arrow, tokStart);
113 }
Uday Bondhugula015cbb12018-07-03 20:16:08 -0700114 return formToken(Token::minus, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700115
116 case '?':
117 if (*curPtr == '?') {
118 ++curPtr;
119 return formToken(Token::questionquestion, tokStart);
120 }
121
122 return formToken(Token::question, tokStart);
123
Chris Lattner3e59f082018-07-14 23:06:24 -0700124 case '/':
125 if (*curPtr == '/')
126 return lexComment();
127 return emitError(tokStart, "unexpected character");
128
Uday Bondhugulabc535622018-08-07 14:24:38 -0700129 case '@':
130 if (*curPtr == '@') {
131 ++curPtr;
132 return lexDoubleAtIdentifier(tokStart);
133 }
134 return lexAtIdentifier(tokStart);
135
Chris Lattner78276e32018-07-07 15:48:26 -0700136 case '#':
137 LLVM_FALLTHROUGH;
138 case '%':
139 return lexPrefixedIdentifier(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700140 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700141
142 case '0': case '1': case '2': case '3': case '4':
143 case '5': case '6': case '7': case '8': case '9':
144 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700145 }
146}
147
148/// Lex a comment line, starting with a semicolon.
149///
150/// TODO: add a regex for comments here and to the spec.
151///
152Token Lexer::lexComment() {
Chris Lattner3e59f082018-07-14 23:06:24 -0700153 // Advance over the second '/' in a '//' comment.
154 assert(*curPtr == '/');
155 ++curPtr;
156
Chris Lattnere79379a2018-06-22 10:39:19 -0700157 while (true) {
158 switch (*curPtr++) {
159 case '\n':
160 case '\r':
161 // Newline is end of comment.
162 return lexToken();
163 case 0:
164 // If this is the end of the buffer, end the comment.
165 if (curPtr-1 == curBuffer.end()) {
166 --curPtr;
167 return lexToken();
168 }
169 LLVM_FALLTHROUGH;
170 default:
171 // Skip over other characters.
172 break;
173 }
174 }
175}
176
177/// Lex a bare identifier or keyword that starts with a letter.
178///
Jacques Pienaar4451c572018-07-31 15:40:09 -0700179/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700180/// integer-type ::= `i[1-9][0-9]*`
Chris Lattnere79379a2018-06-22 10:39:19 -0700181///
182Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Jacques Pienaar4451c572018-07-31 15:40:09 -0700183 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
Jacques Pienaarc0d69302018-07-27 11:07:12 -0700184 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
Jacques Pienaar4451c572018-07-31 15:40:09 -0700185 *curPtr == '$' || *curPtr == '.')
Chris Lattnere79379a2018-06-22 10:39:19 -0700186 ++curPtr;
187
188 // Check to see if this identifier is a keyword.
189 StringRef spelling(tokStart, curPtr-tokStart);
190
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700191 // Check for i123.
192 if (tokStart[0] == 'i') {
193 bool allDigits = true;
194 for (auto c : spelling.drop_front())
195 allDigits &= isdigit(c) != 0;
196 if (allDigits && spelling.size() != 1)
197 return Token(Token::inttype, spelling);
198 }
199
Chris Lattner8da0c282018-06-29 11:15:56 -0700200 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
201#define TOK_KEYWORD(SPELLING) \
202 .Case(#SPELLING, Token::kw_##SPELLING)
203#include "TokenKinds.def"
Chris Lattnere79379a2018-06-22 10:39:19 -0700204 .Default(Token::bare_identifier);
205
206 return Token(kind, spelling);
207}
208
209/// Lex an '@foo' identifier.
210///
211/// function-id ::= `@` bare-id
212///
213Token Lexer::lexAtIdentifier(const char *tokStart) {
214 // These always start with a letter.
215 if (!isalpha(*curPtr++))
216 return emitError(curPtr-1, "expected letter in @ identifier");
217
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700218 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700219 ++curPtr;
220 return formToken(Token::at_identifier, tokStart);
221}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700222
Uday Bondhugulabc535622018-08-07 14:24:38 -0700223/// Lex an '@@foo' identifier.
224///
225/// function-id ::= `@@` bare-id
226///
227Token Lexer::lexDoubleAtIdentifier(const char *tokStart) {
228 // These always start with a letter.
229 if (!isalpha(*curPtr++))
230 return emitError(curPtr - 1, "expected letter in @@ identifier");
231
232 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
233 ++curPtr;
234 return formToken(Token::double_at_identifier, tokStart);
235}
236
Chris Lattner78276e32018-07-07 15:48:26 -0700237/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Teamf85a6262018-06-27 11:03:08 -0700238///
239/// affine-map-id ::= `#` suffix-id
Chris Lattner78276e32018-07-07 15:48:26 -0700240/// ssa-id ::= '%' suffix-id
MLIR Teamf85a6262018-06-27 11:03:08 -0700241/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
242///
Chris Lattner78276e32018-07-07 15:48:26 -0700243Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
244 Token::Kind kind;
245 StringRef errorKind;
246 switch (*tokStart) {
247 case '#':
248 kind = Token::hash_identifier;
249 errorKind = "invalid affine map name";
250 break;
251 case '%':
252 kind = Token::percent_identifier;
253 errorKind = "invalid SSA name";
254 break;
255 default:
256 llvm_unreachable("invalid caller");
257 }
258
MLIR Teamf85a6262018-06-27 11:03:08 -0700259 // Parse suffix-id.
260 if (isdigit(*curPtr)) {
261 // If suffix-id starts with a digit, the rest must be digits.
262 while (isdigit(*curPtr)) {
263 ++curPtr;
264 }
265 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
266 do {
267 ++curPtr;
268 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
269 } else {
Chris Lattner78276e32018-07-07 15:48:26 -0700270 return emitError(curPtr - 1, errorKind);
MLIR Teamf85a6262018-06-27 11:03:08 -0700271 }
Chris Lattner78276e32018-07-07 15:48:26 -0700272
273 return formToken(kind, tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700274}
275
Jacques Pienaar84491092018-07-31 17:15:15 -0700276/// Lex a number literal.
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700277///
278/// integer-literal ::= digit+ | `0x` hex_digit+
Jacques Pienaar84491092018-07-31 17:15:15 -0700279/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700280///
281Token Lexer::lexNumber(const char *tokStart) {
282 assert(isdigit(curPtr[-1]));
283
284 // Handle the hexadecimal case.
285 if (curPtr[-1] == '0' && *curPtr == 'x') {
286 ++curPtr;
287
288 if (!isxdigit(*curPtr))
289 return emitError(curPtr, "expected hexadecimal digit");
290
291 while (isxdigit(*curPtr))
292 ++curPtr;
293
294 return formToken(Token::integer, tokStart);
295 }
296
297 // Handle the normal decimal case.
298 while (isdigit(*curPtr))
299 ++curPtr;
300
Jacques Pienaar84491092018-07-31 17:15:15 -0700301 if (*curPtr != '.')
302 return formToken(Token::integer, tokStart);
303 ++curPtr;
304
305 // Skip over [0-9]*([eE][-+]?[0-9]+)?
306 while (isdigit(*curPtr)) ++curPtr;
307
308 if (*curPtr == 'e' || *curPtr == 'E') {
309 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
310 ((curPtr[1] == '-' || curPtr[1] == '+') &&
311 isdigit(static_cast<unsigned char>(curPtr[2])))) {
312 curPtr += 2;
313 while (isdigit(*curPtr)) ++curPtr;
314 }
315 }
316 return formToken(Token::floatliteral, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700317}
Chris Lattnered65a732018-06-28 20:45:33 -0700318
319/// Lex a string literal.
320///
321/// string-literal ::= '"' [^"\n\f\v\r]* '"'
322///
323/// TODO: define escaping rules.
324Token Lexer::lexString(const char *tokStart) {
325 assert(curPtr[-1] == '"');
326
327 while (1) {
328 switch (*curPtr++) {
329 case '"':
330 return formToken(Token::string, tokStart);
331 case '0':
332 // If this is a random nul character in the middle of a string, just
333 // include it. If it is the end of file, then it is an error.
334 if (curPtr-1 != curBuffer.end())
335 continue;
336 LLVM_FALLTHROUGH;
337 case '\n':
338 case '\v':
339 case '\f':
340 return emitError(curPtr-1, "expected '\"' in string literal");
James Molloy3cdb8aa2018-08-14 01:16:45 -0700341 case '\\':
Chris Lattner0497c4b2018-08-15 09:09:54 -0700342 // Handle explicitly a few escapes.
343 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
James Molloy3cdb8aa2018-08-14 01:16:45 -0700344 ++curPtr;
Chris Lattner0497c4b2018-08-15 09:09:54 -0700345 else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
346 // Support \xx for two hex digits.
347 curPtr += 2;
348 else
349 return emitError(curPtr - 1, "unknown escape in string literal");
James Molloy3cdb8aa2018-08-14 01:16:45 -0700350 continue;
Chris Lattnered65a732018-06-28 20:45:33 -0700351
352 default:
353 continue;
354 }
355 }
356}