Blame - lib/Parser/Lexer.cpp - platform/external/tensorflow

blob: 02db4db8a5997bd4febb19e2266745a118f38670 [file] [log] [blame]

Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	1	//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
				2	//
				3	// Copyright 2019 The MLIR Authors.
				4	//
				5	// Licensed under the Apache License, Version 2.0 (the "License");
				6	// you may not use this file except in compliance with the License.
				7	// You may obtain a copy of the License at
				8	//
				9	// http://www.apache.org/licenses/LICENSE-2.0
				10	//
				11	// Unless required by applicable law or agreed to in writing, software
				12	// distributed under the License is distributed on an "AS IS" BASIS,
				13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	// See the License for the specific language governing permissions and
				15	// limitations under the License.
				16	// =============================================================================
				17	//
				18	// This file implements the lexer for the MLIR textual form.
				19	//
				20	//===----------------------------------------------------------------------===//
				21
				22	#include "Lexer.h"
				23	#include "llvm/Support/SourceMgr.h"
				24	using namespace mlir;
				25	using llvm::SMLoc;
				26	using llvm::SourceMgr;
				27
/// Return true if `c` is one of the allowable identifier punctuation
/// characters, i.e. one of [$._-]; return false for every other character.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
				33
Jacques Pienaar	0bffd86	2018-07-11 13:26:23 -0700	[diff] [blame]	34	Lexer::Lexer(llvm::SourceMgr &sourceMgr, SMDiagnosticHandlerTy errorReporter)
Jacques Pienaar	9c411be	2018-06-24 19:17:35 -0700	[diff] [blame]	35	: sourceMgr(sourceMgr), errorReporter(errorReporter) {
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	36	auto bufferID = sourceMgr.getMainFileID();
				37	curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
				38	curPtr = curBuffer.begin();
				39	}
				40
				41	/// emitError - Emit an error message and return an Token::error token.
				42	Token Lexer::emitError(const char *loc, const Twine &message) {
Jacques Pienaar	9c411be	2018-06-24 19:17:35 -0700	[diff] [blame]	43	errorReporter(sourceMgr.GetMessage(SMLoc::getFromPointer(loc),
				44	SourceMgr::DK_Error, message));
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	45	return formToken(Token::error, loc);
				46	}
				47
				48	Token Lexer::lexToken() {
				49	const char *tokStart = curPtr;
				50
				51	switch (*curPtr++) {
				52	default:
				53	// Handle bare identifiers.
				54	if (isalpha(curPtr[-1]))
				55	return lexBareIdentifierOrKeyword(tokStart);
				56
				57	// Unknown character, emit an error.
				58	return emitError(tokStart, "unexpected character");
				59
Chris Lattner	ee0c2ae	2018-07-29 12:37:35 -0700	[diff] [blame]	60	case '_':
				61	// Handle bare identifiers.
				62	return lexBareIdentifierOrKeyword(tokStart);
				63
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	64	case 0:
				65	// This may either be a nul character in the source file or may be the EOF
				66	// marker that llvm::MemoryBuffer guarantees will be there.
				67	if (curPtr-1 == curBuffer.end())
				68	return formToken(Token::eof, tokStart);
				69
				70	LLVM_FALLTHROUGH;
				71	case ' ':
				72	case '\t':
				73	case '\n':
				74	case '\r':
				75	// Ignore whitespace.
				76	return lexToken();
				77
Chris Lattner	4c95a50	2018-06-23 16:03:42 -0700	[diff] [blame]	78	case ':': return formToken(Token::colon, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	79	case ',': return formToken(Token::comma, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	80	case '(': return formToken(Token::l_paren, tokStart);
				81	case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner	4c95a50	2018-06-23 16:03:42 -0700	[diff] [blame]	82	case '{': return formToken(Token::l_brace, tokStart);
				83	case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner	85ee151	2018-07-25 11:15:20 -0700	[diff] [blame]	84	case '[':
				85	return formToken(Token::l_square, tokStart);
				86	case ']':
				87	return formToken(Token::r_square, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	88	case '<': return formToken(Token::less, tokStart);
				89	case '>': return formToken(Token::greater, tokStart);
Uday Bondhugula	faf37dd	2018-06-29 18:09:29 -0700	[diff] [blame]	90	case '=': return formToken(Token::equal, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	91
Uday Bondhugula	faf37dd	2018-06-29 18:09:29 -0700	[diff] [blame]	92	case '+': return formToken(Token::plus, tokStart);
				93	case '*': return formToken(Token::star, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	94	case '-':
				95	if (*curPtr == '>') {
				96	++curPtr;
				97	return formToken(Token::arrow, tokStart);
				98	}
Uday Bondhugula	015cbb1	2018-07-03 20:16:08 -0700	[diff] [blame]	99	return formToken(Token::minus, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	100
				101	case '?':
				102	if (*curPtr == '?') {
				103	++curPtr;
				104	return formToken(Token::questionquestion, tokStart);
				105	}
				106
				107	return formToken(Token::question, tokStart);
				108
Chris Lattner	3e59f08	2018-07-14 23:06:24 -0700	[diff] [blame]	109	case '/':
				110	if (*curPtr == '/')
				111	return lexComment();
				112	return emitError(tokStart, "unexpected character");
				113
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	114	case '@': return lexAtIdentifier(tokStart);
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	115	case '#':
				116	LLVM_FALLTHROUGH;
				117	case '%':
				118	return lexPrefixedIdentifier(tokStart);
Chris Lattner	ed65a73	2018-06-28 20:45:33 -0700	[diff] [blame]	119	case '"': return lexString(tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	120
				121	case '0': case '1': case '2': case '3': case '4':
				122	case '5': case '6': case '7': case '8': case '9':
				123	return lexNumber(tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	124	}
				125	}
				126
				127	/// Lex a comment line, starting with a semicolon.
				128	///
				129	/// TODO: add a regex for comments here and to the spec.
				130	///
				131	Token Lexer::lexComment() {
Chris Lattner	3e59f08	2018-07-14 23:06:24 -0700	[diff] [blame]	132	// Advance over the second '/' in a '//' comment.
				133	assert(*curPtr == '/');
				134	++curPtr;
				135
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	136	while (true) {
				137	switch (*curPtr++) {
				138	case '\n':
				139	case '\r':
				140	// Newline is end of comment.
				141	return lexToken();
				142	case 0:
				143	// If this is the end of the buffer, end the comment.
				144	if (curPtr-1 == curBuffer.end()) {
				145	--curPtr;
				146	return lexToken();
				147	}
				148	LLVM_FALLTHROUGH;
				149	default:
				150	// Skip over other characters.
				151	break;
				152	}
				153	}
				154	}
				155
				156	/// Lex a bare identifier or keyword that starts with a letter.
				157	///
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	158	/// bare-id ::= (letter\|[_]) (letter\|digit\|[_$.])*
Chris Lattner	f958bbe	2018-06-29 22:08:05 -0700	[diff] [blame]	159	/// integer-type ::= `i[1-9][0-9]*`
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	160	///
				161	Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	162	// Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
Jacques Pienaar	c0d6930	2018-07-27 11:07:12 -0700	[diff] [blame]	163	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == '_' \|\|
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	164	curPtr == '$' \|\| curPtr == '.')
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	165	++curPtr;
				166
				167	// Check to see if this identifier is a keyword.
				168	StringRef spelling(tokStart, curPtr-tokStart);
				169
Chris Lattner	f958bbe	2018-06-29 22:08:05 -0700	[diff] [blame]	170	// Check for i123.
				171	if (tokStart[0] == 'i') {
				172	bool allDigits = true;
				173	for (auto c : spelling.drop_front())
				174	allDigits &= isdigit(c) != 0;
				175	if (allDigits && spelling.size() != 1)
				176	return Token(Token::inttype, spelling);
				177	}
				178
Chris Lattner	8da0c28	2018-06-29 11:15:56 -0700	[diff] [blame]	179	Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
				180	#define TOK_KEYWORD(SPELLING) \
				181	.Case(#SPELLING, Token::kw_##SPELLING)
				182	#include "TokenKinds.def"
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	183	.Default(Token::bare_identifier);
				184
				185	return Token(kind, spelling);
				186	}
				187
				188	/// Lex an '@foo' identifier.
				189	///
				190	/// function-id ::= `@` bare-id
				191	///
				192	Token Lexer::lexAtIdentifier(const char *tokStart) {
				193	// These always start with a letter.
				194	if (!isalpha(*curPtr++))
				195	return emitError(curPtr-1, "expected letter in @ identifier");
				196
Chris Lattner	f6d80a0	2018-06-24 11:18:29 -0700	[diff] [blame]	197	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == '_')
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	198	++curPtr;
				199	return formToken(Token::at_identifier, tokStart);
				200	}
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	201
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	202	/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	203	///
				204	/// affine-map-id ::= `#` suffix-id
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	205	/// ssa-id ::= '%' suffix-id
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	206	/// suffix-id ::= digit+ \| (letter\|id-punct) (letter\|id-punct\|digit)*
				207	///
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	208	Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
				209	Token::Kind kind;
				210	StringRef errorKind;
				211	switch (*tokStart) {
				212	case '#':
				213	kind = Token::hash_identifier;
				214	errorKind = "invalid affine map name";
				215	break;
				216	case '%':
				217	kind = Token::percent_identifier;
				218	errorKind = "invalid SSA name";
				219	break;
				220	default:
				221	llvm_unreachable("invalid caller");
				222	}
				223
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	224	// Parse suffix-id.
				225	if (isdigit(*curPtr)) {
				226	// If suffix-id starts with a digit, the rest must be digits.
				227	while (isdigit(*curPtr)) {
				228	++curPtr;
				229	}
				230	} else if (isalpha(curPtr) \|\| isPunct(curPtr)) {
				231	do {
				232	++curPtr;
				233	} while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| isPunct(*curPtr));
				234	} else {
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	235	return emitError(curPtr - 1, errorKind);
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	236	}
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	237
				238	return formToken(kind, tokStart);
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	239	}
				240
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame^]	241	/// Lex a number literal.
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	242	///
				243	/// integer-literal ::= digit+ \| `0x` hex_digit+
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame^]	244	/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	245	///
				246	Token Lexer::lexNumber(const char *tokStart) {
				247	assert(isdigit(curPtr[-1]));
				248
				249	// Handle the hexadecimal case.
				250	if (curPtr[-1] == '0' && *curPtr == 'x') {
				251	++curPtr;
				252
				253	if (!isxdigit(*curPtr))
				254	return emitError(curPtr, "expected hexadecimal digit");
				255
				256	while (isxdigit(*curPtr))
				257	++curPtr;
				258
				259	return formToken(Token::integer, tokStart);
				260	}
				261
				262	// Handle the normal decimal case.
				263	while (isdigit(*curPtr))
				264	++curPtr;
				265
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame^]	266	if (*curPtr != '.')
				267	return formToken(Token::integer, tokStart);
				268	++curPtr;
				269
				270	// Skip over [0-9]*([eE][-+]?[0-9]+)?
				271	while (isdigit(*curPtr)) ++curPtr;
				272
				273	if (curPtr == 'e' \|\| curPtr == 'E') {
				274	if (isdigit(static_cast<unsigned char>(curPtr[1])) \|\|
				275	((curPtr[1] == '-' \|\| curPtr[1] == '+') &&
				276	isdigit(static_cast<unsigned char>(curPtr[2])))) {
				277	curPtr += 2;
				278	while (isdigit(*curPtr)) ++curPtr;
				279	}
				280	}
				281	return formToken(Token::floatliteral, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	282	}
Chris Lattner	ed65a73	2018-06-28 20:45:33 -0700	[diff] [blame]	283
				284	/// Lex a string literal.
				285	///
				286	/// string-literal ::= '"' [^"\n\f\v\r]* '"'
				287	///
				288	/// TODO: define escaping rules.
				289	Token Lexer::lexString(const char *tokStart) {
				290	assert(curPtr[-1] == '"');
				291
				292	while (1) {
				293	switch (*curPtr++) {
				294	case '"':
				295	return formToken(Token::string, tokStart);
				296	case '0':
				297	// If this is a random nul character in the middle of a string, just
				298	// include it. If it is the end of file, then it is an error.
				299	if (curPtr-1 != curBuffer.end())
				300	continue;
				301	LLVM_FALLTHROUGH;
				302	case '\n':
				303	case '\v':
				304	case '\f':
				305	return emitError(curPtr-1, "expected '\"' in string literal");
				306
				307	default:
				308	continue;
				309	}
				310	}
				311	}