blob: dd7ad6ceb101df796e6e2c7b3f040e39dabb0004 [file] [log] [blame]
Chris Lattnera8058742007-11-18 02:57:27 +00001//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file was developed by Chris Lattner and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implement the Lexer for TableGen.
11//
12//===----------------------------------------------------------------------===//
13
Chris Lattner6aaca042007-11-18 05:25:45 +000014#include "TGLexer.h"
Chris Lattnera8058742007-11-18 02:57:27 +000015#include "Record.h"
16#include "llvm/Support/Streams.h"
17#include "Record.h"
Chris Lattnera8058742007-11-18 02:57:27 +000018#include "llvm/Support/MemoryBuffer.h"
19typedef std::pair<llvm::Record*, std::vector<llvm::Init*>*> SubClassRefTy;
20#include "FileParser.h"
21#include <cctype>
22using namespace llvm;
23
24// FIXME: REMOVE THIS.
25#define YYEOF 0
26#define YYERROR -2
27
28TGLexer::TGLexer(MemoryBuffer *StartBuf) : CurLineNo(1), CurBuf(StartBuf) {
29 CurPtr = CurBuf->getBufferStart();
30}
31
32TGLexer::~TGLexer() {
33 while (!IncludeStack.empty()) {
34 delete IncludeStack.back().Buffer;
35 IncludeStack.pop_back();
36 }
37 delete CurBuf;
38}
39
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +000040/// ReturnError - Set the error to the specified string at the specified
41/// location. This is defined to always return YYERROR.
42int TGLexer::ReturnError(const char *Loc, const std::string &Msg) {
43 PrintError(Loc, Msg);
44 return YYERROR;
45}
Chris Lattnera8058742007-11-18 02:57:27 +000046
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +000047std::ostream &TGLexer::err() const {
Chris Lattnera8058742007-11-18 02:57:27 +000048 PrintIncludeStack(*cerr.stream());
49 return *cerr.stream();
50}
51
52
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +000053void TGLexer::PrintIncludeStack(std::ostream &OS) const {
Chris Lattnera8058742007-11-18 02:57:27 +000054 for (unsigned i = 0, e = IncludeStack.size(); i != e; ++i)
55 OS << "Included from " << IncludeStack[i].Buffer->getBufferIdentifier()
56 << ":" << IncludeStack[i].LineNo << ":\n";
57 OS << "Parsing " << CurBuf->getBufferIdentifier() << ":"
58 << CurLineNo << ": ";
59}
60
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +000061/// PrintError - Print the error at the specified location.
62void TGLexer::PrintError(const char *ErrorLoc, const std::string &Msg) const {
63 err() << Msg << "\n";
64 assert(ErrorLoc && "Location not specified!");
65
66 // Scan backward to find the start of the line.
67 const char *LineStart = ErrorLoc;
68 while (LineStart != CurBuf->getBufferStart() &&
69 LineStart[-1] != '\n' && LineStart[-1] != '\r')
70 --LineStart;
71 // Get the end of the line.
72 const char *LineEnd = ErrorLoc;
73 while (LineEnd != CurBuf->getBufferEnd() &&
74 LineEnd[0] != '\n' && LineEnd[0] != '\r')
75 ++LineEnd;
76 // Print out the line.
77 cerr << std::string(LineStart, LineEnd) << "\n";
78 // Print out spaces before the carat.
79 const char *Pos = LineStart;
80 while (Pos != ErrorLoc)
81 cerr << (*Pos == '\t' ? '\t' : ' ');
82 cerr << "^\n";
83}
84
Chris Lattnera8058742007-11-18 02:57:27 +000085int TGLexer::getNextChar() {
86 char CurChar = *CurPtr++;
87 switch (CurChar) {
88 default:
Chris Lattnerc1819182007-11-18 05:48:46 +000089 return (unsigned char)CurChar;
Chris Lattnera8058742007-11-18 02:57:27 +000090 case 0:
91 // A nul character in the stream is either the end of the current buffer or
92 // a random nul in the file. Disambiguate that here.
93 if (CurPtr-1 != CurBuf->getBufferEnd())
94 return 0; // Just whitespace.
95
96 // If this is the end of an included file, pop the parent file off the
97 // include stack.
98 if (!IncludeStack.empty()) {
99 delete CurBuf;
100 CurBuf = IncludeStack.back().Buffer;
101 CurLineNo = IncludeStack.back().LineNo;
102 CurPtr = IncludeStack.back().CurPtr;
103 IncludeStack.pop_back();
104 return getNextChar();
105 }
106
107 // Otherwise, return end of file.
108 --CurPtr; // Another call to lex will return EOF again.
109 return EOF;
110 case '\n':
111 case '\r':
112 // Handle the newline character by ignoring it and incrementing the line
113 // count. However, be careful about 'dos style' files with \n\r in them.
114 // Only treat a \n\r or \r\n as a single line.
115 if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
116 *CurPtr != CurChar)
Chris Lattnerc1819182007-11-18 05:48:46 +0000117 ++CurPtr; // Eat the two char newline sequence.
Chris Lattnera8058742007-11-18 02:57:27 +0000118
119 ++CurLineNo;
120 return '\n';
121 }
122}
123
124int TGLexer::LexToken() {
125 // This always consumes at least one character.
126 int CurChar = getNextChar();
127
128 switch (CurChar) {
129 default:
130 // Handle letters: [a-zA-Z_]
131 if (isalpha(CurChar) || CurChar == '_')
132 return LexIdentifier();
133
134 // Unknown character, return the char itself.
135 return (unsigned char)CurChar;
136 case EOF: return YYEOF;
137 case 0:
138 case ' ':
139 case '\t':
140 case '\n':
141 case '\r':
142 // Ignore whitespace.
143 return LexToken();
144 case '/':
145 // If this is the start of a // comment, skip until the end of the line or
146 // the end of the buffer.
147 if (*CurPtr == '/')
148 SkipBCPLComment();
149 else if (*CurPtr == '*') {
150 if (SkipCComment())
151 return YYERROR;
152 } else // Otherwise, return this / as a token.
153 return CurChar;
154 return LexToken();
155 case '-': case '+':
156 case '0': case '1': case '2': case '3': case '4': case '5': case '6':
157 case '7': case '8': case '9':
158 return LexNumber();
159 case '"': return LexString();
160 case '$': return LexVarName();
161 case '[': return LexBracket();
162 case '!': return LexExclaim();
163 }
164}
165
166/// LexString - Lex "[^"]*"
167int TGLexer::LexString() {
168 const char *StrStart = CurPtr;
169
170 while (*CurPtr != '"') {
171 // If we hit the end of the buffer, report an error.
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000172 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
173 return ReturnError(StrStart, "End of file in string literal");
174
175 if (*CurPtr == '\n' || *CurPtr == '\r')
176 return ReturnError(StrStart, "End of line in string literal");
Chris Lattnera8058742007-11-18 02:57:27 +0000177
178 ++CurPtr;
179 }
180
181 Filelval.StrVal = new std::string(StrStart, CurPtr);
182 ++CurPtr;
183 return STRVAL;
184}
185
186int TGLexer::LexVarName() {
187 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
188 return '$'; // Invalid varname.
189
190 // Otherwise, we're ok, consume the rest of the characters.
191 const char *VarNameStart = CurPtr++;
192
193 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
194 ++CurPtr;
195
196 Filelval.StrVal = new std::string(VarNameStart, CurPtr);
197 return VARNAME;
198}
199
200
201int TGLexer::LexIdentifier() {
202 // The first letter is [a-zA-Z_].
203 const char *IdentStart = CurPtr-1;
204
205 // Match the rest of the identifier regex: [0-9a-zA-Z_]*
206 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
207 ++CurPtr;
208
209 // Check to see if this identifier is a keyword.
210 unsigned Len = CurPtr-IdentStart;
211
212 if (Len == 3 && !memcmp(IdentStart, "int", 3)) return INT;
213 if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return BIT;
214 if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return BITS;
215 if (Len == 6 && !memcmp(IdentStart, "string", 6)) return STRING;
216 if (Len == 4 && !memcmp(IdentStart, "list", 4)) return LIST;
217 if (Len == 4 && !memcmp(IdentStart, "code", 4)) return CODE;
218 if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return DAG;
219
220 if (Len == 5 && !memcmp(IdentStart, "class", 5)) return CLASS;
221 if (Len == 3 && !memcmp(IdentStart, "def", 3)) return DEF;
222 if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return DEFM;
223 if (Len == 10 && !memcmp(IdentStart, "multiclass", 10)) return MULTICLASS;
224 if (Len == 5 && !memcmp(IdentStart, "field", 5)) return FIELD;
225 if (Len == 3 && !memcmp(IdentStart, "let", 3)) return LET;
226 if (Len == 2 && !memcmp(IdentStart, "in", 2)) return IN;
227
228 if (Len == 7 && !memcmp(IdentStart, "include", 7)) {
229 if (LexInclude()) return YYERROR;
230 return LexToken();
231 }
232
233 Filelval.StrVal = new std::string(IdentStart, CurPtr);
234 return ID;
235}
236
237/// LexInclude - We just read the "include" token. Get the string token that
238/// comes next and enter the include.
239bool TGLexer::LexInclude() {
240 // The token after the include must be a string.
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000241 const char *TokStart = CurPtr-7;
Chris Lattnera8058742007-11-18 02:57:27 +0000242 int Tok = LexToken();
243 if (Tok == YYERROR) return true;
244 if (Tok != STRVAL) {
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000245 PrintError(TokStart, "Expected filename after include");
Chris Lattnera8058742007-11-18 02:57:27 +0000246 return true;
247 }
248
249 // Get the string.
250 std::string Filename = *Filelval.StrVal;
251 delete Filelval.StrVal;
252
253 // Try to find the file.
254 MemoryBuffer *NewBuf = MemoryBuffer::getFile(&Filename[0], Filename.size());
255
256 // If the file didn't exist directly, see if it's in an include path.
257 for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
258 std::string IncFile = IncludeDirectories[i] + "/" + Filename;
259 NewBuf = MemoryBuffer::getFile(&IncFile[0], IncFile.size());
260 }
261
262 if (NewBuf == 0) {
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000263 PrintError(TokStart, "Could not find include file '" + Filename + "'");
Chris Lattnera8058742007-11-18 02:57:27 +0000264 return true;
265 }
266
267 // Save the line number and lex buffer of the includer.
268 IncludeStack.push_back(IncludeRec(CurBuf, CurPtr, CurLineNo));
269
270 CurLineNo = 1; // Reset line numbering.
271 CurBuf = NewBuf;
272 CurPtr = CurBuf->getBufferStart();
273 return false;
274}
275
276void TGLexer::SkipBCPLComment() {
277 ++CurPtr; // skip the second slash.
278 while (1) {
279 switch (*CurPtr) {
280 case '\n':
281 case '\r':
282 return; // Newline is end of comment.
283 case 0:
284 // If this is the end of the buffer, end the comment.
285 if (CurPtr == CurBuf->getBufferEnd())
286 return;
287 break;
288 }
289 // Otherwise, skip the character.
290 ++CurPtr;
291 }
292}
293
294/// SkipCComment - This skips C-style /**/ comments. The only difference from C
295/// is that we allow nesting.
296bool TGLexer::SkipCComment() {
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000297 const char *CommentStart = CurPtr-1;
Chris Lattnera8058742007-11-18 02:57:27 +0000298 ++CurPtr; // skip the star.
299 unsigned CommentDepth = 1;
300
301 while (1) {
302 int CurChar = getNextChar();
303 switch (CurChar) {
304 case EOF:
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000305 PrintError(CommentStart, "Unterminated comment!");
Chris Lattnera8058742007-11-18 02:57:27 +0000306 return true;
307 case '*':
308 // End of the comment?
309 if (CurPtr[0] != '/') break;
310
311 ++CurPtr; // End the */.
312 if (--CommentDepth == 0)
313 return false;
314 break;
315 case '/':
316 // Start of a nested comment?
317 if (CurPtr[0] != '*') break;
318 ++CurPtr;
319 ++CommentDepth;
320 break;
321 }
322 }
323}
324
325/// LexNumber - Lex:
326/// [-+]?[0-9]+
327/// 0x[0-9a-fA-F]+
328/// 0b[01]+
329int TGLexer::LexNumber() {
330 const char *NumStart = CurPtr-1;
331
332 if (CurPtr[-1] == '0') {
333 if (CurPtr[0] == 'x') {
334 ++CurPtr;
335 NumStart = CurPtr;
336 while (isxdigit(CurPtr[0]))
337 ++CurPtr;
338
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000339 // Requires at least one hex digit.
340 if (CurPtr == NumStart)
341 return ReturnError(CurPtr-2, "Invalid hexadecimal number");
342
Chris Lattnera8058742007-11-18 02:57:27 +0000343 Filelval.IntVal = strtoll(NumStart, 0, 16);
344 return INTVAL;
345 } else if (CurPtr[0] == 'b') {
346 ++CurPtr;
347 NumStart = CurPtr;
348 while (CurPtr[0] == '0' || CurPtr[0] == '1')
349 ++CurPtr;
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000350
351 // Requires at least one binary digit.
352 if (CurPtr == NumStart)
353 return ReturnError(CurPtr-2, "Invalid binary number");
Chris Lattnera8058742007-11-18 02:57:27 +0000354 Filelval.IntVal = strtoll(NumStart, 0, 2);
355 return INTVAL;
356 }
357 }
358
359 // Check for a sign without a digit.
360 if (CurPtr[-1] == '-' || CurPtr[-1] == '+') {
361 if (!isdigit(CurPtr[0]))
362 return CurPtr[-1];
363 }
364
365 while (isdigit(CurPtr[0]))
366 ++CurPtr;
367 Filelval.IntVal = strtoll(NumStart, 0, 10);
368 return INTVAL;
369}
370
371/// LexBracket - We just read '['. If this is a code block, return it,
372/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
373int TGLexer::LexBracket() {
374 if (CurPtr[0] != '{')
375 return '[';
376 ++CurPtr;
377 const char *CodeStart = CurPtr;
378 while (1) {
379 int Char = getNextChar();
380 if (Char == EOF) break;
381
382 if (Char != '}') continue;
383
384 Char = getNextChar();
385 if (Char == EOF) break;
386 if (Char == ']') {
387 Filelval.StrVal = new std::string(CodeStart, CurPtr-2);
388 return CODEFRAGMENT;
389 }
390 }
391
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000392 return ReturnError(CodeStart-2, "Unterminated Code Block");
Chris Lattnera8058742007-11-18 02:57:27 +0000393}
394
395/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
396int TGLexer::LexExclaim() {
397 if (!isalpha(*CurPtr))
398 return '!';
399
400 const char *Start = CurPtr++;
401 while (isalpha(*CurPtr))
402 ++CurPtr;
403
404 // Check to see which operator this is.
405 unsigned Len = CurPtr-Start;
406
407 if (Len == 3 && !memcmp(Start, "con", 3)) return CONCATTOK;
408 if (Len == 3 && !memcmp(Start, "sra", 3)) return SRATOK;
409 if (Len == 3 && !memcmp(Start, "srl", 3)) return SRLTOK;
410 if (Len == 3 && !memcmp(Start, "shl", 3)) return SHLTOK;
411 if (Len == 9 && !memcmp(Start, "strconcat", 9)) return STRCONCATTOK;
412
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000413 return ReturnError(Start-1, "Unknown operator");
Chris Lattnera8058742007-11-18 02:57:27 +0000414}
415
416//===----------------------------------------------------------------------===//
417// Interfaces used by the Bison parser.
418//===----------------------------------------------------------------------===//
419
420int Fileparse();
421static TGLexer *TheLexer;
422
423namespace llvm {
424
425std::ostream &err() {
426 return TheLexer->err();
427}
428
429/// ParseFile - this function begins the parsing of the specified tablegen
430/// file.
431///
432void ParseFile(const std::string &Filename,
433 const std::vector<std::string> &IncludeDirs) {
434 std::string ErrorStr;
435 MemoryBuffer *F = MemoryBuffer::getFileOrSTDIN(&Filename[0], Filename.size(),
436 &ErrorStr);
437 if (F == 0) {
438 cerr << "Could not open input file '" + Filename + "': " << ErrorStr <<"\n";
439 exit(1);
440 }
441
442 assert(!TheLexer && "Lexer isn't reentrant yet!");
443 TheLexer = new TGLexer(F);
444
445 // Record the location of the include directory so that the lexer can find
446 // it later.
447 TheLexer->setIncludeDirs(IncludeDirs);
448
449 Fileparse();
450
451 // Cleanup
452 delete TheLexer;
453 TheLexer = 0;
454}
455} // End llvm namespace
456
457
458int Filelex() {
459 assert(TheLexer && "No lexer setup yet!");
460 int Tok = TheLexer->LexToken();
Chris Lattnerc8a9bbc2007-11-19 07:38:58 +0000461 if (Tok == YYERROR)
Chris Lattnera8058742007-11-18 02:57:27 +0000462 exit(1);
Chris Lattnera8058742007-11-18 02:57:27 +0000463 return Tok;
464}