blob: 011dfcbb4ed78e56bff933986f7768733e61d2a6 [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
/// Return true when `c` is one of the punctuation characters permitted inside
/// an identifier suffix, i.e. one of [$._-]; return false for anything else.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
33
Jacques Pienaar0bffd862018-07-11 13:26:23 -070034Lexer::Lexer(llvm::SourceMgr &sourceMgr, SMDiagnosticHandlerTy errorReporter)
Jacques Pienaar9c411be2018-06-24 19:17:35 -070035 : sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattnere79379a2018-06-22 10:39:19 -070036 auto bufferID = sourceMgr.getMainFileID();
37 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
38 curPtr = curBuffer.begin();
39}
40
41/// emitError - Emit an error message and return an Token::error token.
42Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar9c411be2018-06-24 19:17:35 -070043 errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
44 SourceMgr::DK_Error, message));
Chris Lattnere79379a2018-06-22 10:39:19 -070045 return formToken(Token::error, loc);
46}
47
48Token Lexer::lexToken() {
49 const char *tokStart = curPtr;
50
51 switch (*curPtr++) {
52 default:
53 // Handle bare identifiers.
54 if (isalpha(curPtr[-1]))
55 return lexBareIdentifierOrKeyword(tokStart);
56
57 // Unknown character, emit an error.
58 return emitError(tokStart, "unexpected character");
59
60 case 0:
61 // This may either be a nul character in the source file or may be the EOF
62 // marker that llvm::MemoryBuffer guarantees will be there.
63 if (curPtr-1 == curBuffer.end())
64 return formToken(Token::eof, tokStart);
65
66 LLVM_FALLTHROUGH;
67 case ' ':
68 case '\t':
69 case '\n':
70 case '\r':
71 // Ignore whitespace.
72 return lexToken();
73
Chris Lattner4c95a502018-06-23 16:03:42 -070074 case ':': return formToken(Token::colon, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070075 case ',': return formToken(Token::comma, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070076 case '(': return formToken(Token::l_paren, tokStart);
77 case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner4c95a502018-06-23 16:03:42 -070078 case '{': return formToken(Token::l_brace, tokStart);
79 case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner85ee1512018-07-25 11:15:20 -070080 case '[':
81 return formToken(Token::l_square, tokStart);
82 case ']':
83 return formToken(Token::r_square, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070084 case '<': return formToken(Token::less, tokStart);
85 case '>': return formToken(Token::greater, tokStart);
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070086 case '=': return formToken(Token::equal, tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -070087
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -070088 case '+': return formToken(Token::plus, tokStart);
89 case '*': return formToken(Token::star, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070090 case '-':
91 if (*curPtr == '>') {
92 ++curPtr;
93 return formToken(Token::arrow, tokStart);
94 }
Uday Bondhugula015cbb12018-07-03 20:16:08 -070095 return formToken(Token::minus, tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -070096
97 case '?':
98 if (*curPtr == '?') {
99 ++curPtr;
100 return formToken(Token::questionquestion, tokStart);
101 }
102
103 return formToken(Token::question, tokStart);
104
Chris Lattner3e59f082018-07-14 23:06:24 -0700105 case '/':
106 if (*curPtr == '/')
107 return lexComment();
108 return emitError(tokStart, "unexpected character");
109
Chris Lattnere79379a2018-06-22 10:39:19 -0700110 case '@': return lexAtIdentifier(tokStart);
Chris Lattner78276e32018-07-07 15:48:26 -0700111 case '#':
112 LLVM_FALLTHROUGH;
113 case '%':
114 return lexPrefixedIdentifier(tokStart);
Chris Lattnered65a732018-06-28 20:45:33 -0700115 case '"': return lexString(tokStart);
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700116
117 case '0': case '1': case '2': case '3': case '4':
118 case '5': case '6': case '7': case '8': case '9':
119 return lexNumber(tokStart);
Chris Lattnere79379a2018-06-22 10:39:19 -0700120 }
121}
122
123/// Lex a comment line, starting with a semicolon.
124///
125/// TODO: add a regex for comments here and to the spec.
126///
127Token Lexer::lexComment() {
Chris Lattner3e59f082018-07-14 23:06:24 -0700128 // Advance over the second '/' in a '//' comment.
129 assert(*curPtr == '/');
130 ++curPtr;
131
Chris Lattnere79379a2018-06-22 10:39:19 -0700132 while (true) {
133 switch (*curPtr++) {
134 case '\n':
135 case '\r':
136 // Newline is end of comment.
137 return lexToken();
138 case 0:
139 // If this is the end of the buffer, end the comment.
140 if (curPtr-1 == curBuffer.end()) {
141 --curPtr;
142 return lexToken();
143 }
144 LLVM_FALLTHROUGH;
145 default:
146 // Skip over other characters.
147 break;
148 }
149 }
150}
151
152/// Lex a bare identifier or keyword that starts with a letter.
153///
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700154/// bare-id ::= letter (letter|digit|[_])*
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700155/// integer-type ::= `i[1-9][0-9]*`
Chris Lattnere79379a2018-06-22 10:39:19 -0700156///
157Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700158 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
159 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700160 ++curPtr;
161
162 // Check to see if this identifier is a keyword.
163 StringRef spelling(tokStart, curPtr-tokStart);
164
Chris Lattnerf958bbe2018-06-29 22:08:05 -0700165 // Check for i123.
166 if (tokStart[0] == 'i') {
167 bool allDigits = true;
168 for (auto c : spelling.drop_front())
169 allDigits &= isdigit(c) != 0;
170 if (allDigits && spelling.size() != 1)
171 return Token(Token::inttype, spelling);
172 }
173
Chris Lattner8da0c282018-06-29 11:15:56 -0700174 Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
175#define TOK_KEYWORD(SPELLING) \
176 .Case(#SPELLING, Token::kw_##SPELLING)
177#include "TokenKinds.def"
Chris Lattnere79379a2018-06-22 10:39:19 -0700178 .Default(Token::bare_identifier);
179
180 return Token(kind, spelling);
181}
182
183/// Lex an '@foo' identifier.
184///
185/// function-id ::= `@` bare-id
186///
187Token Lexer::lexAtIdentifier(const char *tokStart) {
188 // These always start with a letter.
189 if (!isalpha(*curPtr++))
190 return emitError(curPtr-1, "expected letter in @ identifier");
191
Chris Lattnerf6d80a02018-06-24 11:18:29 -0700192 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_')
Chris Lattnere79379a2018-06-22 10:39:19 -0700193 ++curPtr;
194 return formToken(Token::at_identifier, tokStart);
195}
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700196
Chris Lattner78276e32018-07-07 15:48:26 -0700197/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Teamf85a6262018-06-27 11:03:08 -0700198///
199/// affine-map-id ::= `#` suffix-id
Chris Lattner78276e32018-07-07 15:48:26 -0700200/// ssa-id ::= '%' suffix-id
MLIR Teamf85a6262018-06-27 11:03:08 -0700201/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
202///
Chris Lattner78276e32018-07-07 15:48:26 -0700203Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
204 Token::Kind kind;
205 StringRef errorKind;
206 switch (*tokStart) {
207 case '#':
208 kind = Token::hash_identifier;
209 errorKind = "invalid affine map name";
210 break;
211 case '%':
212 kind = Token::percent_identifier;
213 errorKind = "invalid SSA name";
214 break;
215 default:
216 llvm_unreachable("invalid caller");
217 }
218
MLIR Teamf85a6262018-06-27 11:03:08 -0700219 // Parse suffix-id.
220 if (isdigit(*curPtr)) {
221 // If suffix-id starts with a digit, the rest must be digits.
222 while (isdigit(*curPtr)) {
223 ++curPtr;
224 }
225 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
226 do {
227 ++curPtr;
228 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
229 } else {
Chris Lattner78276e32018-07-07 15:48:26 -0700230 return emitError(curPtr - 1, errorKind);
MLIR Teamf85a6262018-06-27 11:03:08 -0700231 }
Chris Lattner78276e32018-07-07 15:48:26 -0700232
233 return formToken(kind, tokStart);
MLIR Teamf85a6262018-06-27 11:03:08 -0700234}
235
Chris Lattnerbb8fafc2018-06-22 15:52:02 -0700236/// Lex an integer literal.
237///
238/// integer-literal ::= digit+ | `0x` hex_digit+
239///
240Token Lexer::lexNumber(const char *tokStart) {
241 assert(isdigit(curPtr[-1]));
242
243 // Handle the hexadecimal case.
244 if (curPtr[-1] == '0' && *curPtr == 'x') {
245 ++curPtr;
246
247 if (!isxdigit(*curPtr))
248 return emitError(curPtr, "expected hexadecimal digit");
249
250 while (isxdigit(*curPtr))
251 ++curPtr;
252
253 return formToken(Token::integer, tokStart);
254 }
255
256 // Handle the normal decimal case.
257 while (isdigit(*curPtr))
258 ++curPtr;
259
260 return formToken(Token::integer, tokStart);
261}
Chris Lattnered65a732018-06-28 20:45:33 -0700262
263/// Lex a string literal.
264///
265/// string-literal ::= '"' [^"\n\f\v\r]* '"'
266///
267/// TODO: define escaping rules.
268Token Lexer::lexString(const char *tokStart) {
269 assert(curPtr[-1] == '"');
270
271 while (1) {
272 switch (*curPtr++) {
273 case '"':
274 return formToken(Token::string, tokStart);
275 case '0':
276 // If this is a random nul character in the middle of a string, just
277 // include it. If it is the end of file, then it is an error.
278 if (curPtr-1 != curBuffer.end())
279 continue;
280 LLVM_FALLTHROUGH;
281 case '\n':
282 case '\v':
283 case '\f':
284 return emitError(curPtr-1, "expected '\"' in string literal");
285
286 default:
287 continue;
288 }
289 }
290}
Uday Bondhugulafaf37dd2018-06-29 18:09:29 -0700291