blob: 5958658b797a575aa21417195c7eedcbf25c8dbf [file] [log] [blame]
Chris Lattnere79379a2018-06-22 10:39:19 -07001//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Copyright 2019 The MLIR Authors.
4//
5// Licensed under the Apache License, Version 2.0 (the "License");
6// you may not use this file except in compliance with the License.
7// You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16// =============================================================================
17//
18// This file implements the lexer for the MLIR textual form.
19//
20//===----------------------------------------------------------------------===//
21
22#include "Lexer.h"
23#include "llvm/Support/SourceMgr.h"
24using namespace mlir;
25using llvm::SMLoc;
26using llvm::SourceMgr;
27
28Lexer::Lexer(llvm::SourceMgr &sourceMgr) : sourceMgr(sourceMgr) {
29 auto bufferID = sourceMgr.getMainFileID();
30 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
31 curPtr = curBuffer.begin();
32}
33
34/// emitError - Emit an error message and return an Token::error token.
35Token Lexer::emitError(const char *loc, const Twine &message) {
36 // TODO(clattner): If/when we want to implement a -verify mode, this will need
37 // to package up errors into SMDiagnostic and report them.
38 sourceMgr.PrintMessage(SMLoc::getFromPointer(loc), SourceMgr::DK_Error,
39 message);
40 return formToken(Token::error, loc);
41}
42
43Token Lexer::lexToken() {
44 const char *tokStart = curPtr;
45
46 switch (*curPtr++) {
47 default:
48 // Handle bare identifiers.
49 if (isalpha(curPtr[-1]))
50 return lexBareIdentifierOrKeyword(tokStart);
51
52 // Unknown character, emit an error.
53 return emitError(tokStart, "unexpected character");
54
55 case 0:
56 // This may either be a nul character in the source file or may be the EOF
57 // marker that llvm::MemoryBuffer guarantees will be there.
58 if (curPtr-1 == curBuffer.end())
59 return formToken(Token::eof, tokStart);
60
61 LLVM_FALLTHROUGH;
62 case ' ':
63 case '\t':
64 case '\n':
65 case '\r':
66 // Ignore whitespace.
67 return lexToken();
68
69 case '(': return formToken(Token::l_paren, tokStart);
70 case ')': return formToken(Token::r_paren, tokStart);
71 case '<': return formToken(Token::less, tokStart);
72 case '>': return formToken(Token::greater, tokStart);
73
74 case ';': return lexComment();
75 case '@': return lexAtIdentifier(tokStart);
76 }
77}
78
79/// Lex a comment line, starting with a semicolon.
80///
81/// TODO: add a regex for comments here and to the spec.
82///
83Token Lexer::lexComment() {
84 while (true) {
85 switch (*curPtr++) {
86 case '\n':
87 case '\r':
88 // Newline is end of comment.
89 return lexToken();
90 case 0:
91 // If this is the end of the buffer, end the comment.
92 if (curPtr-1 == curBuffer.end()) {
93 --curPtr;
94 return lexToken();
95 }
96 LLVM_FALLTHROUGH;
97 default:
98 // Skip over other characters.
99 break;
100 }
101 }
102}
103
104/// Lex a bare identifier or keyword that starts with a letter.
105///
106/// bare-id ::= letter (letter|digit)*
107///
108Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
109 // Match the rest of the identifier regex: [0-9a-zA-Z]*
110 while (isalpha(*curPtr) || isdigit(*curPtr))
111 ++curPtr;
112
113 // Check to see if this identifier is a keyword.
114 StringRef spelling(tokStart, curPtr-tokStart);
115
116 Token::TokenKind kind = llvm::StringSwitch<Token::TokenKind>(spelling)
117 .Case("cfgfunc", Token::kw_cfgfunc)
118 .Case("extfunc", Token::kw_extfunc)
119 .Case("mlfunc", Token::kw_mlfunc)
120 .Default(Token::bare_identifier);
121
122 return Token(kind, spelling);
123}
124
125/// Lex an '@foo' identifier.
126///
127/// function-id ::= `@` bare-id
128///
129Token Lexer::lexAtIdentifier(const char *tokStart) {
130 // These always start with a letter.
131 if (!isalpha(*curPtr++))
132 return emitError(curPtr-1, "expected letter in @ identifier");
133
134 while (isalpha(*curPtr) || isdigit(*curPtr))
135 ++curPtr;
136 return formToken(Token::at_identifier, tokStart);
137}