Blame - lib/Parser/Lexer.cpp - platform/external/tensorflow

blob: b4f8e1db6b530ee523cf4027b2393e5fa80a9f9e [file] [log] [blame]

Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	1	//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
				2	//
				3	// Copyright 2019 The MLIR Authors.
				4	//
				5	// Licensed under the Apache License, Version 2.0 (the "License");
				6	// you may not use this file except in compliance with the License.
				7	// You may obtain a copy of the License at
				8	//
				9	// http://www.apache.org/licenses/LICENSE-2.0
				10	//
				11	// Unless required by applicable law or agreed to in writing, software
				12	// distributed under the License is distributed on an "AS IS" BASIS,
				13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	// See the License for the specific language governing permissions and
				15	// limitations under the License.
				16	// =============================================================================
				17	//
				18	// This file implements the lexer for the MLIR textual form.
				19	//
				20	//===----------------------------------------------------------------------===//
				21
				22	#include "Lexer.h"
Chris Lattner	7879f84	2018-09-02 22:01:45 -0700	[diff] [blame^]	23	#include "mlir/IR/Location.h"
				24	#include "mlir/IR/MLIRContext.h"
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	25	#include "llvm/Support/SourceMgr.h"
				26	using namespace mlir;
				27	using llvm::SMLoc;
				28	using llvm::SourceMgr;
				29
/// Returns true if 'c' is one of the punctuation characters allowed inside a
/// suffix identifier, i.e. one of `$`, `.`, `_` or `-`; returns false for
/// every other character.
static bool isPunct(char c) {
  return c == '$' || c == '.' || c == '_' || c == '-';
}
				35
Chris Lattner	7879f84	2018-09-02 22:01:45 -0700	[diff] [blame^]	36	Lexer::Lexer(llvm::SourceMgr &sourceMgr, MLIRContext *context)
				37	: sourceMgr(sourceMgr), context(context) {
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	38	auto bufferID = sourceMgr.getMainFileID();
				39	curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
				40	curPtr = curBuffer.begin();
				41	}
				42
Chris Lattner	7879f84	2018-09-02 22:01:45 -0700	[diff] [blame^]	43	/// Encode the specified source location information into an attribute for
				44	/// attachment to the IR.
				45	Location *Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
				46	auto &sourceMgr = getSourceMgr();
				47	unsigned mainFileID = sourceMgr.getMainFileID();
				48	auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
				49	auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
				50	auto filename = UniquedFilename::get(buffer->getBufferIdentifier(), context);
				51
				52	return FileLineColLoc::get(filename, lineAndColumn.first,
				53	lineAndColumn.second, context);
				54	}
				55
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	56	/// emitError - Emit an error message and return an Token::error token.
				57	Token Lexer::emitError(const char *loc, const Twine &message) {
Chris Lattner	7879f84	2018-09-02 22:01:45 -0700	[diff] [blame^]	58	context->emitDiagnostic(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
				59	message, MLIRContext::DiagnosticKind::Error);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	60	return formToken(Token::error, loc);
				61	}
				62
				63	Token Lexer::lexToken() {
				64	const char *tokStart = curPtr;
				65
				66	switch (*curPtr++) {
				67	default:
				68	// Handle bare identifiers.
				69	if (isalpha(curPtr[-1]))
				70	return lexBareIdentifierOrKeyword(tokStart);
				71
				72	// Unknown character, emit an error.
				73	return emitError(tokStart, "unexpected character");
				74
Chris Lattner	ee0c2ae	2018-07-29 12:37:35 -0700	[diff] [blame]	75	case '_':
				76	// Handle bare identifiers.
				77	return lexBareIdentifierOrKeyword(tokStart);
				78
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	79	case 0:
				80	// This may either be a nul character in the source file or may be the EOF
				81	// marker that llvm::MemoryBuffer guarantees will be there.
				82	if (curPtr-1 == curBuffer.end())
				83	return formToken(Token::eof, tokStart);
				84
				85	LLVM_FALLTHROUGH;
				86	case ' ':
				87	case '\t':
				88	case '\n':
				89	case '\r':
				90	// Ignore whitespace.
				91	return lexToken();
				92
Chris Lattner	4c95a50	2018-06-23 16:03:42 -0700	[diff] [blame]	93	case ':': return formToken(Token::colon, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	94	case ',': return formToken(Token::comma, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	95	case '(': return formToken(Token::l_paren, tokStart);
				96	case ')': return formToken(Token::r_paren, tokStart);
Chris Lattner	4c95a50	2018-06-23 16:03:42 -0700	[diff] [blame]	97	case '{': return formToken(Token::l_brace, tokStart);
				98	case '}': return formToken(Token::r_brace, tokStart);
Chris Lattner	85ee151	2018-07-25 11:15:20 -0700	[diff] [blame]	99	case '[':
				100	return formToken(Token::l_square, tokStart);
				101	case ']':
				102	return formToken(Token::r_square, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	103	case '<': return formToken(Token::less, tokStart);
				104	case '>': return formToken(Token::greater, tokStart);
Uday Bondhugula	faf37dd	2018-06-29 18:09:29 -0700	[diff] [blame]	105	case '=': return formToken(Token::equal, tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	106
Uday Bondhugula	faf37dd	2018-06-29 18:09:29 -0700	[diff] [blame]	107	case '+': return formToken(Token::plus, tokStart);
				108	case '*': return formToken(Token::star, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	109	case '-':
				110	if (*curPtr == '>') {
				111	++curPtr;
				112	return formToken(Token::arrow, tokStart);
				113	}
Uday Bondhugula	015cbb1	2018-07-03 20:16:08 -0700	[diff] [blame]	114	return formToken(Token::minus, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	115
				116	case '?':
				117	if (*curPtr == '?') {
				118	++curPtr;
				119	return formToken(Token::questionquestion, tokStart);
				120	}
				121
				122	return formToken(Token::question, tokStart);
				123
Chris Lattner	3e59f08	2018-07-14 23:06:24 -0700	[diff] [blame]	124	case '/':
				125	if (*curPtr == '/')
				126	return lexComment();
				127	return emitError(tokStart, "unexpected character");
				128
Uday Bondhugula	bc53562	2018-08-07 14:24:38 -0700	[diff] [blame]	129	case '@':
				130	if (*curPtr == '@') {
				131	++curPtr;
				132	return lexDoubleAtIdentifier(tokStart);
				133	}
				134	return lexAtIdentifier(tokStart);
				135
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	136	case '#':
				137	LLVM_FALLTHROUGH;
				138	case '%':
				139	return lexPrefixedIdentifier(tokStart);
Chris Lattner	ed65a73	2018-06-28 20:45:33 -0700	[diff] [blame]	140	case '"': return lexString(tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	141
				142	case '0': case '1': case '2': case '3': case '4':
				143	case '5': case '6': case '7': case '8': case '9':
				144	return lexNumber(tokStart);
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	145	}
				146	}
				147
				148	/// Lex a comment line, starting with a semicolon.
				149	///
				150	/// TODO: add a regex for comments here and to the spec.
				151	///
				152	Token Lexer::lexComment() {
Chris Lattner	3e59f08	2018-07-14 23:06:24 -0700	[diff] [blame]	153	// Advance over the second '/' in a '//' comment.
				154	assert(*curPtr == '/');
				155	++curPtr;
				156
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	157	while (true) {
				158	switch (*curPtr++) {
				159	case '\n':
				160	case '\r':
				161	// Newline is end of comment.
				162	return lexToken();
				163	case 0:
				164	// If this is the end of the buffer, end the comment.
				165	if (curPtr-1 == curBuffer.end()) {
				166	--curPtr;
				167	return lexToken();
				168	}
				169	LLVM_FALLTHROUGH;
				170	default:
				171	// Skip over other characters.
				172	break;
				173	}
				174	}
				175	}
				176
				177	/// Lex a bare identifier or keyword that starts with a letter.
				178	///
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	179	/// bare-id ::= (letter\|[_]) (letter\|digit\|[_$.])*
Chris Lattner	f958bbe	2018-06-29 22:08:05 -0700	[diff] [blame]	180	/// integer-type ::= `i[1-9][0-9]*`
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	181	///
				182	Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	183	// Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
Jacques Pienaar	c0d6930	2018-07-27 11:07:12 -0700	[diff] [blame]	184	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == '_' \|\|
Jacques Pienaar	4451c57	2018-07-31 15:40:09 -0700	[diff] [blame]	185	curPtr == '$' \|\| curPtr == '.')
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	186	++curPtr;
				187
				188	// Check to see if this identifier is a keyword.
				189	StringRef spelling(tokStart, curPtr-tokStart);
				190
Chris Lattner	f958bbe	2018-06-29 22:08:05 -0700	[diff] [blame]	191	// Check for i123.
				192	if (tokStart[0] == 'i') {
				193	bool allDigits = true;
				194	for (auto c : spelling.drop_front())
				195	allDigits &= isdigit(c) != 0;
				196	if (allDigits && spelling.size() != 1)
				197	return Token(Token::inttype, spelling);
				198	}
				199
Chris Lattner	8da0c28	2018-06-29 11:15:56 -0700	[diff] [blame]	200	Token::Kind kind = llvm::StringSwitch<Token::Kind>(spelling)
				201	#define TOK_KEYWORD(SPELLING) \
				202	.Case(#SPELLING, Token::kw_##SPELLING)
				203	#include "TokenKinds.def"
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	204	.Default(Token::bare_identifier);
				205
				206	return Token(kind, spelling);
				207	}
				208
				209	/// Lex an '@foo' identifier.
				210	///
				211	/// function-id ::= `@` bare-id
				212	///
				213	Token Lexer::lexAtIdentifier(const char *tokStart) {
				214	// These always start with a letter.
				215	if (!isalpha(*curPtr++))
				216	return emitError(curPtr-1, "expected letter in @ identifier");
				217
Chris Lattner	f6d80a0	2018-06-24 11:18:29 -0700	[diff] [blame]	218	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == '_')
Chris Lattner	e79379a	2018-06-22 10:39:19 -0700	[diff] [blame]	219	++curPtr;
				220	return formToken(Token::at_identifier, tokStart);
				221	}
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	222
Uday Bondhugula	bc53562	2018-08-07 14:24:38 -0700	[diff] [blame]	223	/// Lex an '@@foo' identifier.
				224	///
				225	/// function-id ::= `@@` bare-id
				226	///
				227	Token Lexer::lexDoubleAtIdentifier(const char *tokStart) {
				228	// These always start with a letter.
				229	if (!isalpha(*curPtr++))
				230	return emitError(curPtr - 1, "expected letter in @@ identifier");
				231
				232	while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| *curPtr == '_')
				233	++curPtr;
				234	return formToken(Token::double_at_identifier, tokStart);
				235	}
				236
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	237	/// Lex an identifier that starts with a prefix followed by suffix-id.
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	238	///
				239	/// affine-map-id ::= `#` suffix-id
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	240	/// ssa-id ::= '%' suffix-id
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	241	/// suffix-id ::= digit+ \| (letter\|id-punct) (letter\|id-punct\|digit)*
				242	///
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	243	Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
				244	Token::Kind kind;
				245	StringRef errorKind;
				246	switch (*tokStart) {
				247	case '#':
				248	kind = Token::hash_identifier;
				249	errorKind = "invalid affine map name";
				250	break;
				251	case '%':
				252	kind = Token::percent_identifier;
				253	errorKind = "invalid SSA name";
				254	break;
				255	default:
				256	llvm_unreachable("invalid caller");
				257	}
				258
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	259	// Parse suffix-id.
				260	if (isdigit(*curPtr)) {
				261	// If suffix-id starts with a digit, the rest must be digits.
				262	while (isdigit(*curPtr)) {
				263	++curPtr;
				264	}
				265	} else if (isalpha(curPtr) \|\| isPunct(curPtr)) {
				266	do {
				267	++curPtr;
				268	} while (isalpha(curPtr) \|\| isdigit(curPtr) \|\| isPunct(*curPtr));
				269	} else {
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	270	return emitError(curPtr - 1, errorKind);
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	271	}
Chris Lattner	78276e3	2018-07-07 15:48:26 -0700	[diff] [blame]	272
				273	return formToken(kind, tokStart);
MLIR Team	f85a626	2018-06-27 11:03:08 -0700	[diff] [blame]	274	}
				275
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame]	276	/// Lex a number literal.
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	277	///
				278	/// integer-literal ::= digit+ \| `0x` hex_digit+
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame]	279	/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	280	///
				281	Token Lexer::lexNumber(const char *tokStart) {
				282	assert(isdigit(curPtr[-1]));
				283
				284	// Handle the hexadecimal case.
				285	if (curPtr[-1] == '0' && *curPtr == 'x') {
				286	++curPtr;
				287
				288	if (!isxdigit(*curPtr))
				289	return emitError(curPtr, "expected hexadecimal digit");
				290
				291	while (isxdigit(*curPtr))
				292	++curPtr;
				293
				294	return formToken(Token::integer, tokStart);
				295	}
				296
				297	// Handle the normal decimal case.
				298	while (isdigit(*curPtr))
				299	++curPtr;
				300
Jacques Pienaar	8449109	2018-07-31 17:15:15 -0700	[diff] [blame]	301	if (*curPtr != '.')
				302	return formToken(Token::integer, tokStart);
				303	++curPtr;
				304
				305	// Skip over [0-9]*([eE][-+]?[0-9]+)?
				306	while (isdigit(*curPtr)) ++curPtr;
				307
				308	if (curPtr == 'e' \|\| curPtr == 'E') {
				309	if (isdigit(static_cast<unsigned char>(curPtr[1])) \|\|
				310	((curPtr[1] == '-' \|\| curPtr[1] == '+') &&
				311	isdigit(static_cast<unsigned char>(curPtr[2])))) {
				312	curPtr += 2;
				313	while (isdigit(*curPtr)) ++curPtr;
				314	}
				315	}
				316	return formToken(Token::floatliteral, tokStart);
Chris Lattner	bb8fafc	2018-06-22 15:52:02 -0700	[diff] [blame]	317	}
Chris Lattner	ed65a73	2018-06-28 20:45:33 -0700	[diff] [blame]	318
				319	/// Lex a string literal.
				320	///
				321	/// string-literal ::= '"' [^"\n\f\v\r]* '"'
				322	///
				323	/// TODO: define escaping rules.
				324	Token Lexer::lexString(const char *tokStart) {
				325	assert(curPtr[-1] == '"');
				326
				327	while (1) {
				328	switch (*curPtr++) {
				329	case '"':
				330	return formToken(Token::string, tokStart);
				331	case '0':
				332	// If this is a random nul character in the middle of a string, just
				333	// include it. If it is the end of file, then it is an error.
				334	if (curPtr-1 != curBuffer.end())
				335	continue;
				336	LLVM_FALLTHROUGH;
				337	case '\n':
				338	case '\v':
				339	case '\f':
				340	return emitError(curPtr-1, "expected '\"' in string literal");
James Molloy	3cdb8aa	2018-08-14 01:16:45 -0700	[diff] [blame]	341	case '\\':
Chris Lattner	0497c4b	2018-08-15 09:09:54 -0700	[diff] [blame]	342	// Handle explicitly a few escapes.
				343	if (curPtr == '"' \|\| curPtr == '\\' \|\| curPtr == 'n' \|\| curPtr == 't')
James Molloy	3cdb8aa	2018-08-14 01:16:45 -0700	[diff] [blame]	344	++curPtr;
Chris Lattner	0497c4b	2018-08-15 09:09:54 -0700	[diff] [blame]	345	else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
				346	// Support \xx for two hex digits.
				347	curPtr += 2;
				348	else
				349	return emitError(curPtr - 1, "unknown escape in string literal");
James Molloy	3cdb8aa	2018-08-14 01:16:45 -0700	[diff] [blame]	350	continue;
Chris Lattner	ed65a73	2018-06-28 20:45:33 -0700	[diff] [blame]	351
				352	default:
				353	continue;
				354	}
				355	}
				356	}