blob: 91fa8ad666af2113ee683b5f5c5c1ac5ea286eed [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
// Returns true if 'c' is an allowable punctuation character: [$._-]
// Returns false for every other character.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
33
Jacques Pienaar0bffd862018-07-11 13:26:23 -070034Lexer::Lexer(llvm::SourceMgr &sourceMgr, SMDiagnosticHandlerTy errorReporter)
Jacques Pienaar9c411be2018-06-24 19:17:35 -070035 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070036 auto bufferID = sourceMgr.getMainFileID();
37 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
38 curPtr = curBuffer.begin();
39}
40
41/// emitError - Emit an error message and return an Token::error token.
42Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070043 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
44 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070045 return formToken(Token::error, loc);
46}
47
48Token Lexer::lexToken() {
49 const char *tokStart = curPtr;
50
51 switch (*curPtr++) {
52 default:
53 // Handle bare identifiers.
54 if (isalpha(curPtr[-1]))
55 return lexBareIdentifierOrKeyword(tokStart);
56
57 // Unknown character, emit an error.
58 return emitError(tokStart, "unexpected character");
59
Chris Lattneree0c2ae2018-07-29 12:37:35 -070060 case '_':
61 // Handle bare identifiers.
62 return lexBareIdentifierOrKeyword(tokStart);
63
Chris Lattnere79379a2018-06-22 10:39:19 -070064 case 0:
65 // This may either be a nul character in the source file or may be the EOF
66 // marker that llvm::MemoryBuffer guarantees will be there.
67 if (curPtr-1 == curBuffer.end())
68 return formToken(Token::eof, tokStart);
69
70 LLVM_FALLTHROUGH;
71 case ' ':
72 case '\t':
73 case '\n':
74 case '\r':
75 // Ignore whitespace.
76 return lexToken();
77
Chris Lattner4c95a502018-06-23 16:03:42 -070078 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070079 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070080 case '(': return formToken(Token::l_paren, tokStart);
81 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070082 case '{': return formToken(Token::l_brace, tokStart);
83 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner85ee1512018-07-25 11:15:20 -070084 case '[':
85 return formToken(Token::l_square, tokStart);
86 case ']':
87 return formToken(Token::r_square, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070088 case '<': return formToken(Token::less, tokStart);
89 case '>': return formToken(Token::greater, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070090 case '=': return formToken(Token::equal, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070091
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070092 case '+': return formToken(Token::plus, tokStart);
93 case '*': return formToken(Token::star, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070094 case '-':
95 if (*curPtr == '>') {
96 ++curPtr;
97 return formToken(Token::arrow, tokStart);
98 }
Uday Bondhugula015cbb12018-07-03 20:16:08 -070099 return formToken(Token::minus, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700100
101 case '?':
102 if (*curPtr == '?') {
103 ++curPtr;
104 return formToken(Token::questionquestion, tokStart);
105 }
106
107 return formToken(Token::question, tokStart);
108
Chris Lattner3e59f082018-07-14 23:06:24 -0700109 case '/':
110 if (*curPtr == '/')
111 return lexComment();
112 return emitError(tokStart, "unexpected character");
113
Uday Bondhugulabc535622018-08-07 14:24:38 -0700114 case '@':
115 if (*curPtr == '@') {
116 ++curPtr;
117 return lexDoubleAtIdentifier(tokStart);
118 }
119 return lexAtIdentifier(tokStart);
120
Chris Lattner78276e32018-07-07 15:48:26 -0700121 case '#':
122 LLVM_FALLTHROUGH;
123 case '%':
124 return lexPrefixedIdentifier(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700125 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700126
127 case '0': case '1': case '2': case '3': case '4':
128 case '5': case '6': case '7': case '8': case '9':
129 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700130 }
131}
132
133/// Lex a comment line, starting with a semicolon.
134///
135/// TODO: add a regex for comments here and to the spec.
136///
137Token Lexer::lexComment() {
Chris Lattner3e59f082018-07-14 23:06:24 -0700138 // Advance over the second '/' in a '//' comment.
139 assert(*curPtr == '/');
140 ++curPtr;
141
Chris Lattnere79379a2018-06-22 10:39:19 -0700142 while (true) {
143 switch (*curPtr++) {
144 case '\n':
145 case '\r':
146 // Newline is end of comment.
147 return lexToken();
148 case 0:
149 // If this is the end of the buffer, end the comment.
150 if (curPtr-1 == curBuffer.end()) {
151 --curPtr;
152 return lexToken();
153 }
154 LLVM_FALLTHROUGH;
155 default:
156 // Skip over other characters.
157 break;
158 }
159 }
160}
161
162/// Lex a bare identifier or keyword that starts with a letter.
163///
Jacques Pienaar4451c572018-07-31 15:40:09 -0700164/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700165/// integer-type ::= `i[1-9][0-9]*`
Chris Lattnere79379a2018-06-22 10:39:19 -0700166///
167Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Jacques Pienaar4451c572018-07-31 15:40:09 -0700168 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
Jacques Pienaarc0d69302018-07-27 11:07:12 -0700169 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
Jacques Pienaar4451c572018-07-31 15:40:09 -0700170 *curPtr == '$' || *curPtr == '.')
Chris Lattnere79379a2018-06-22 10:39:19 -0700171 ++curPtr;
172
173 // Check to see if this identifier is a keyword.
174 StringRef spelling(tokStart, curPtr-tokStart);
175
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700176 // Check for i123.
177 if (tokStart[0] == 'i') {
178 bool allDigits = true;
179 for (auto c : spelling.drop_front())
180 allDigits &= isdigit(c) != 0;
181 if (allDigits && spelling.size() != 1)
182 return Token(Token::inttype, spelling);
183 }
184
Chris Lattner8da0c282018-06-29 11:15:56 -0700185 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
186#define TOK_KEYWORD(SPELLING) \
187 .Case(#SPELLING, Token::kw_##SPELLING)
188#include "TokenKinds.def"
Chris Lattnere79379a2018-06-22 10:39:19 -0700189 .Default(Token::bare_identifier);
190
191 return Token(kind, spelling);
192}
193
194/// Lex an '@foo' identifier.
195///
196/// function-id ::= `@` bare-id
197///
198Token Lexer::lexAtIdentifier(const char *tokStart) {
199 // These always start with a letter.
200 if (!isalpha(*curPtr++))
201 return emitError(curPtr-1, "expected letter in @ identifier");
202
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700203 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700204 ++curPtr;
205 return formToken(Token::at_identifier, tokStart);
206}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700207
Uday Bondhugulabc535622018-08-07 14:24:38 -0700208/// Lex an '@@foo' identifier.
209///
210/// function-id ::= `@@` bare-id
211///
212Token Lexer::lexDoubleAtIdentifier(const char *tokStart) {
213 // These always start with a letter.
214 if (!isalpha(*curPtr++))
215 return emitError(curPtr - 1, "expected letter in @@ identifier");
216
217 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
218 ++curPtr;
219 return formToken(Token::double_at_identifier, tokStart);
220}
221
Chris Lattner78276e32018-07-07 15:48:26 -0700222/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Teamf85a6262018-06-27 11:03:08 -0700223///
224/// affine-map-id ::= `#` suffix-id
Chris Lattner78276e32018-07-07 15:48:26 -0700225/// ssa-id ::= '%' suffix-id
MLIR Teamf85a6262018-06-27 11:03:08 -0700226/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
227///
Chris Lattner78276e32018-07-07 15:48:26 -0700228Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
229 Token::Kind kind;
230 StringRef errorKind;
231 switch (*tokStart) {
232 case '#':
233 kind = Token::hash_identifier;
234 errorKind = "invalid affine map name";
235 break;
236 case '%':
237 kind = Token::percent_identifier;
238 errorKind = "invalid SSA name";
239 break;
240 default:
241 llvm_unreachable("invalid caller");
242 }
243
MLIR Teamf85a6262018-06-27 11:03:08 -0700244 // Parse suffix-id.
245 if (isdigit(*curPtr)) {
246 // If suffix-id starts with a digit, the rest must be digits.
247 while (isdigit(*curPtr)) {
248 ++curPtr;
249 }
250 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
251 do {
252 ++curPtr;
253 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
254 } else {
Chris Lattner78276e32018-07-07 15:48:26 -0700255 return emitError(curPtr - 1, errorKind);
MLIR Teamf85a6262018-06-27 11:03:08 -0700256 }
Chris Lattner78276e32018-07-07 15:48:26 -0700257
258 return formToken(kind, tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700259}
260
Jacques Pienaar84491092018-07-31 17:15:15 -0700261/// Lex a number literal.
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700262///
263/// integer-literal ::= digit+ | `0x` hex_digit+
Jacques Pienaar84491092018-07-31 17:15:15 -0700264/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700265///
266Token Lexer::lexNumber(const char *tokStart) {
267 assert(isdigit(curPtr[-1]));
268
269 // Handle the hexadecimal case.
270 if (curPtr[-1] == '0' && *curPtr == 'x') {
271 ++curPtr;
272
273 if (!isxdigit(*curPtr))
274 return emitError(curPtr, "expected hexadecimal digit");
275
276 while (isxdigit(*curPtr))
277 ++curPtr;
278
279 return formToken(Token::integer, tokStart);
280 }
281
282 // Handle the normal decimal case.
283 while (isdigit(*curPtr))
284 ++curPtr;
285
Jacques Pienaar84491092018-07-31 17:15:15 -0700286 if (*curPtr != '.')
287 return formToken(Token::integer, tokStart);
288 ++curPtr;
289
290 // Skip over [0-9]*([eE][-+]?[0-9]+)?
291 while (isdigit(*curPtr)) ++curPtr;
292
293 if (*curPtr == 'e' || *curPtr == 'E') {
294 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
295 ((curPtr[1] == '-' || curPtr[1] == '+') &&
296 isdigit(static_cast<unsigned char>(curPtr[2])))) {
297 curPtr += 2;
298 while (isdigit(*curPtr)) ++curPtr;
299 }
300 }
301 return formToken(Token::floatliteral, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700302}
Chris Lattnered65a732018-06-28 20:45:33 -0700303
304/// Lex a string literal.
305///
306/// string-literal ::= '"' [^"\n\f\v\r]* '"'
307///
308/// TODO: define escaping rules.
309Token Lexer::lexString(const char *tokStart) {
310 assert(curPtr[-1] == '"');
311
312 while (1) {
313 switch (*curPtr++) {
314 case '"':
315 return formToken(Token::string, tokStart);
316 case '0':
317 // If this is a random nul character in the middle of a string, just
318 // include it. If it is the end of file, then it is an error.
319 if (curPtr-1 != curBuffer.end())
320 continue;
321 LLVM_FALLTHROUGH;
322 case '\n':
323 case '\v':
324 case '\f':
325 return emitError(curPtr-1, "expected '\"' in string literal");
326
327 default:
328 continue;
329 }
330 }
331}