blob: 8e945604f1df9dc58296c2c20cef5fb979f2e94b [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
// Returns true if 'c' is one of the punctuation characters permitted inside
// an identifier suffix, i.e. one of [$._-].  Returns false for anything else.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
33
Jacques Pienaar9c411be2018-06-24 19:17:35 -070034Lexer::Lexer(llvm::SourceMgr &sourceMgr,
35 const SMDiagnosticHandlerTy &errorReporter)
36 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070037 auto bufferID = sourceMgr.getMainFileID();
38 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
39 curPtr = curBuffer.begin();
40}
41
42/// emitError - Emit an error message and return an Token::error token.
43Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070044 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
45 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070046 return formToken(Token::error, loc);
47}
48
49Token Lexer::lexToken() {
50 const char *tokStart = curPtr;
51
52 switch (*curPtr++) {
53 default:
54 // Handle bare identifiers.
55 if (isalpha(curPtr[-1]))
56 return lexBareIdentifierOrKeyword(tokStart);
57
58 // Unknown character, emit an error.
59 return emitError(tokStart, "unexpected character");
60
61 case 0:
62 // This may either be a nul character in the source file or may be the EOF
63 // marker that llvm::MemoryBuffer guarantees will be there.
64 if (curPtr-1 == curBuffer.end())
65 return formToken(Token::eof, tokStart);
66
67 LLVM_FALLTHROUGH;
68 case ' ':
69 case '\t':
70 case '\n':
71 case '\r':
72 // Ignore whitespace.
73 return lexToken();
74
Chris Lattner4c95a502018-06-23 16:03:42 -070075 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070076 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070077 case '(': return formToken(Token::l_paren, tokStart);
78 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070079 case '{': return formToken(Token::l_brace, tokStart);
80 case '}': return formToken(Token::r_brace, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070081 case '[': return formToken(Token::l_bracket, tokStart);
82 case ']': return formToken(Token::r_bracket, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070083 case '<': return formToken(Token::less, tokStart);
84 case '>': return formToken(Token::greater, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070085 case '=': return formToken(Token::equal, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070086
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070087 case '+': return formToken(Token::plus, tokStart);
88 case '*': return formToken(Token::star, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070089 case '-':
90 if (*curPtr == '>') {
91 ++curPtr;
92 return formToken(Token::arrow, tokStart);
93 }
Uday Bondhugula015cbb12018-07-03 20:16:08 -070094 return formToken(Token::minus, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070095
96 case '?':
97 if (*curPtr == '?') {
98 ++curPtr;
99 return formToken(Token::questionquestion, tokStart);
100 }
101
102 return formToken(Token::question, tokStart);
103
Chris Lattnere79379a2018-06-22 10:39:19 -0700104 case ';': return lexComment();
105 case '@': return lexAtIdentifier(tokStart);
Chris Lattner78276e32018-07-07 15:48:26 -0700106 case '#':
107 LLVM_FALLTHROUGH;
108 case '%':
109 return lexPrefixedIdentifier(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700110 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700111
112 case '0': case '1': case '2': case '3': case '4':
113 case '5': case '6': case '7': case '8': case '9':
114 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700115 }
116}
117
118/// Lex a comment line, starting with a semicolon.
119///
120/// TODO: add a regex for comments here and to the spec.
121///
122Token Lexer::lexComment() {
123 while (true) {
124 switch (*curPtr++) {
125 case '\n':
126 case '\r':
127 // Newline is end of comment.
128 return lexToken();
129 case 0:
130 // If this is the end of the buffer, end the comment.
131 if (curPtr-1 == curBuffer.end()) {
132 --curPtr;
133 return lexToken();
134 }
135 LLVM_FALLTHROUGH;
136 default:
137 // Skip over other characters.
138 break;
139 }
140 }
141}
142
143/// Lex a bare identifier or keyword that starts with a letter.
144///
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700145/// bare-id ::= letter (letter|digit|[_])*
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700146/// integer-type ::= `i[1-9][0-9]*`
Chris Lattnere79379a2018-06-22 10:39:19 -0700147///
148Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700149 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
150 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700151 ++curPtr;
152
153 // Check to see if this identifier is a keyword.
154 StringRef spelling(tokStart, curPtr-tokStart);
155
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700156 // Check for i123.
157 if (tokStart[0] == 'i') {
158 bool allDigits = true;
159 for (auto c : spelling.drop_front())
160 allDigits &= isdigit(c) != 0;
161 if (allDigits && spelling.size() != 1)
162 return Token(Token::inttype, spelling);
163 }
164
Chris Lattner8da0c282018-06-29 11:15:56 -0700165 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
166#define TOK_KEYWORD(SPELLING) \
167 .Case(#SPELLING, Token::kw_##SPELLING)
168#include "TokenKinds.def"
Chris Lattnere79379a2018-06-22 10:39:19 -0700169 .Default(Token::bare_identifier);
170
171 return Token(kind, spelling);
172}
173
174/// Lex an '@foo' identifier.
175///
176/// function-id ::= `@` bare-id
177///
178Token Lexer::lexAtIdentifier(const char *tokStart) {
179 // These always start with a letter.
180 if (!isalpha(*curPtr++))
181 return emitError(curPtr-1, "expected letter in @ identifier");
182
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700183 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700184 ++curPtr;
185 return formToken(Token::at_identifier, tokStart);
186}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700187
Chris Lattner78276e32018-07-07 15:48:26 -0700188/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Teamf85a6262018-06-27 11:03:08 -0700189///
190/// affine-map-id ::= `#` suffix-id
Chris Lattner78276e32018-07-07 15:48:26 -0700191/// ssa-id ::= '%' suffix-id
MLIR Teamf85a6262018-06-27 11:03:08 -0700192/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
193///
Chris Lattner78276e32018-07-07 15:48:26 -0700194Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
195 Token::Kind kind;
196 StringRef errorKind;
197 switch (*tokStart) {
198 case '#':
199 kind = Token::hash_identifier;
200 errorKind = "invalid affine map name";
201 break;
202 case '%':
203 kind = Token::percent_identifier;
204 errorKind = "invalid SSA name";
205 break;
206 default:
207 llvm_unreachable("invalid caller");
208 }
209
MLIR Teamf85a6262018-06-27 11:03:08 -0700210 // Parse suffix-id.
211 if (isdigit(*curPtr)) {
212 // If suffix-id starts with a digit, the rest must be digits.
213 while (isdigit(*curPtr)) {
214 ++curPtr;
215 }
216 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
217 do {
218 ++curPtr;
219 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
220 } else {
Chris Lattner78276e32018-07-07 15:48:26 -0700221 return emitError(curPtr - 1, errorKind);
MLIR Teamf85a6262018-06-27 11:03:08 -0700222 }
Chris Lattner78276e32018-07-07 15:48:26 -0700223
224 return formToken(kind, tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700225}
226
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700227/// Lex an integer literal.
228///
229/// integer-literal ::= digit+ | `0x` hex_digit+
230///
231Token Lexer::lexNumber(const char *tokStart) {
232 assert(isdigit(curPtr[-1]));
233
234 // Handle the hexadecimal case.
235 if (curPtr[-1] == '0' && *curPtr == 'x') {
236 ++curPtr;
237
238 if (!isxdigit(*curPtr))
239 return emitError(curPtr, "expected hexadecimal digit");
240
241 while (isxdigit(*curPtr))
242 ++curPtr;
243
244 return formToken(Token::integer, tokStart);
245 }
246
247 // Handle the normal decimal case.
248 while (isdigit(*curPtr))
249 ++curPtr;
250
251 return formToken(Token::integer, tokStart);
252}
Chris Lattnered65a732018-06-28 20:45:33 -0700253
254/// Lex a string literal.
255///
256/// string-literal ::= '"' [^"\n\f\v\r]* '"'
257///
258/// TODO: define escaping rules.
259Token Lexer::lexString(const char *tokStart) {
260 assert(curPtr[-1] == '"');
261
262 while (1) {
263 switch (*curPtr++) {
264 case '"':
265 return formToken(Token::string, tokStart);
266 case '0':
267 // If this is a random nul character in the middle of a string, just
268 // include it. If it is the end of file, then it is an error.
269 if (curPtr-1 != curBuffer.end())
270 continue;
271 LLVM_FALLTHROUGH;
272 case '\n':
273 case '\v':
274 case '\f':
275 return emitError(curPtr-1, "expected '\"' in string literal");
276
277 default:
278 continue;
279 }
280 }
281}
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -0700282