blob: fcee028d3ae0d453f97badfd8d05028e976628dd [file] [log] [blame]
//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class represents the Lexer for tablegen files.
//
//===----------------------------------------------------------------------===//
12
Benjamin Kramera7c40ef2014-08-13 16:26:38 +000013#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14#define LLVM_LIB_TABLEGEN_TGLEXER_H
Chris Lattnerda4ab672007-11-18 02:57:27 +000015
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
Chris Lattnerda4ab672007-11-18 02:57:27 +000025
26namespace llvm {
Chris Lattnerfd255752009-06-21 03:41:50 +000027class SourceMgr;
Chris Lattner526c8cb2009-06-21 03:39:35 +000028class SMLoc;
Benjamin Kramerc7583112010-09-27 17:42:11 +000029class Twine;
30
namespace tgtok {
  /// Kinds of tokens produced (or used internally) by the TableGen lexer.
  ///
  /// The enumerators are unscoped and implicitly numbered from zero in the
  /// order listed below, so reordering them changes their numeric values.
  enum TokKind {
    // Markers
    Eof, Error,

    // Tokens with no info.
    minus, plus,        // - +
    l_square, r_square, // [ ]
    l_brace, r_brace,   // { }
    l_paren, r_paren,   // ( )
    less, greater,      // < >
    colon, semi,        // : ;
    comma, period,      // , .
    equal, question,    // = ?
    paste,              // #

    // Keywords.
    Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List,
    MultiClass, String, Defset,

    // !keywords.
    XConcat, XADD, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast,
    XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XEq, XIsA, XDag,
    XNe, XLe, XLt, XGe, XGt,

    // Integer value.
    IntVal,

    // Binary constant.  Note that these are sized according to the number of
    // bits given.
    BinaryIntVal,

    // String valued tokens.
    Id, StrVal, VarName, CodeFragment,

    // Preprocessing tokens for internal usage by the lexer.
    // They are never returned as a result of Lex().
    Ifdef, Else, Endif, Define
  };
} // end namespace tgtok
Chris Lattnerda4ab672007-11-18 02:57:27 +000071
Chris Lattnerf4127dd2007-11-22 20:49:04 +000072/// TGLexer - TableGen Lexer class.
Chris Lattnerda4ab672007-11-18 02:57:27 +000073class TGLexer {
Chris Lattnerfd255752009-06-21 03:41:50 +000074 SourceMgr &SrcMgr;
Nicolai Haehnle169ec092018-03-09 18:32:04 +000075
Chris Lattnerda4ab672007-11-18 02:57:27 +000076 const char *CurPtr;
Rafael Espindolaa3c65092014-07-06 14:24:03 +000077 StringRef CurBuf;
Chris Lattnerda4ab672007-11-18 02:57:27 +000078
Chris Lattnerf4127dd2007-11-22 20:49:04 +000079 // Information about the current token.
80 const char *TokStart;
81 tgtok::TokKind CurCode;
82 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
Dan Gohmanca0546f2008-10-17 01:33:43 +000083 int64_t CurIntVal; // This is valid for INTVAL.
Chris Lattner8db9bc72009-03-13 07:05:43 +000084
85 /// CurBuffer - This is the current buffer index we're lexing from as managed
86 /// by the SourceMgr object.
Alp Tokera55b95b2014-07-06 10:33:31 +000087 unsigned CurBuffer;
Sean Silva3b964242013-02-07 04:30:39 +000088
89public:
90 typedef std::map<std::string, SMLoc> DependenciesMapTy;
91private:
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +000092 /// Dependencies - This is the list of all included files.
Sean Silva3b964242013-02-07 04:30:39 +000093 DependenciesMapTy Dependencies;
94
Chris Lattnerda4ab672007-11-18 02:57:27 +000095public:
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +000096 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
Benjamin Kramerdd0ff852015-04-11 15:32:26 +000097
Chris Lattnerf4127dd2007-11-22 20:49:04 +000098 tgtok::TokKind Lex() {
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +000099 return CurCode = LexToken(CurPtr == CurBuf.begin());
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000100 }
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +0000101
Sean Silva3b964242013-02-07 04:30:39 +0000102 const DependenciesMapTy &getDependencies() const {
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +0000103 return Dependencies;
104 }
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000105
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000106 tgtok::TokKind getCode() const { return CurCode; }
107
108 const std::string &getCurStrVal() const {
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000109 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000110 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
111 "This token doesn't have a string value");
112 return CurStrVal;
113 }
Dan Gohmanca0546f2008-10-17 01:33:43 +0000114 int64_t getCurIntVal() const {
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000115 assert(CurCode == tgtok::IntVal && "This token isn't an integer");
116 return CurIntVal;
117 }
Pete Cooper25977642014-08-07 05:47:00 +0000118 std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
119 assert(CurCode == tgtok::BinaryIntVal &&
120 "This token isn't a binary integer");
121 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
122 }
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000123
Chris Lattner526c8cb2009-06-21 03:39:35 +0000124 SMLoc getLoc() const;
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000125
Chris Lattnerda4ab672007-11-18 02:57:27 +0000126private:
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000127 /// LexToken - Read the next token and return its code.
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000128 tgtok::TokKind LexToken(bool FileOrLineStart = false);
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000129
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000130 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
Benjamin Kramerc7583112010-09-27 17:42:11 +0000131 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000132
Chris Lattnerda4ab672007-11-18 02:57:27 +0000133 int getNextChar();
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000134 int peekNextChar(int Index) const;
Chris Lattnerda4ab672007-11-18 02:57:27 +0000135 void SkipBCPLComment();
136 bool SkipCComment();
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000137 tgtok::TokKind LexIdentifier();
Chris Lattnerda4ab672007-11-18 02:57:27 +0000138 bool LexInclude();
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000139 tgtok::TokKind LexString();
140 tgtok::TokKind LexVarName();
141 tgtok::TokKind LexNumber();
142 tgtok::TokKind LexBracket();
143 tgtok::TokKind LexExclaim();
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000144
145 // Process EOF encountered in LexToken().
146 // If EOF is met in an include file, then the method will update
147 // CurPtr, CurBuf and preprocessing include stack, and return true.
148 // If EOF is met in the top-level file, then the method will
149 // update and check the preprocessing include stack, and return false.
150 bool processEOF();
151
152 // *** Structures and methods for preprocessing support ***
153
154 // A set of macro names that are defined either via command line or
155 // by using:
156 // #define NAME
157 StringSet<> DefinedMacros;
158
159 // Each of #ifdef and #else directives has a descriptor associated
160 // with it.
161 //
162 // An ordered list of preprocessing controls defined by #ifdef/#else
163 // directives that are in effect currently is called preprocessing
164 // control stack. It is represented as a vector of PreprocessorControlDesc's.
165 //
166 // The control stack is updated according to the following rules:
167 //
168 // For each #ifdef we add an element to the control stack.
169 // For each #else we replace the top element with a descriptor
170 // with an inverted IsDefined value.
171 // For each #endif we pop the top element from the control stack.
172 //
173 // When CurPtr reaches the current buffer's end, the control stack
174 // must be empty, i.e. #ifdef and the corresponding #endif
175 // must be located in the same file.
176 struct PreprocessorControlDesc {
177 // Either tgtok::Ifdef or tgtok::Else.
178 tgtok::TokKind Kind;
179
180 // True, if the condition for this directive is true, false - otherwise.
181 // Examples:
182 // #ifdef NAME : true, if NAME is defined, false - otherwise.
183 // ...
184 // #else : false, if NAME is defined, true - otherwise.
185 bool IsDefined;
186
187 // Pointer into CurBuf to the beginning of the preprocessing directive
188 // word, e.g.:
189 // #ifdef NAME
190 // ^ - SrcPos
191 SMLoc SrcPos;
192 };
193
194 // We want to disallow code like this:
195 // file1.td:
196 // #define NAME
197 // #ifdef NAME
198 // include "file2.td"
199 // EOF
200 // file2.td:
201 // #endif
202 // EOF
203 //
204 // To do this, we clear the preprocessing control stack on entry
205 // to each of the included file. PrepIncludeStack is used to store
206 // preprocessing control stacks for the current file and all its
207 // parent files. The back() element is the preprocessing control
208 // stack for the current file.
209 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
210 PrepIncludeStack;
211
212 // Validate that the current preprocessing control stack is empty,
213 // since we are about to exit a file, and pop the include stack.
214 //
215 // If IncludeStackMustBeEmpty is true, the include stack must be empty
216 // after the popping, otherwise, the include stack must not be empty
217 // after the popping. Basically, the include stack must be empty
218 // only if we exit the "top-level" file (i.e. finish lexing).
219 //
220 // The method returns false, if the current preprocessing control stack
221 // is not empty (e.g. there is an unterminated #ifdef/#else),
222 // true - otherwise.
223 bool prepExitInclude(bool IncludeStackMustBeEmpty);
224
225 // Look ahead for a preprocessing directive starting from CurPtr. The caller
226 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
227 // a preprocessing directive word followed by a whitespace, then it returns
228 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
229 //
230 // CurPtr is not adjusted by this method.
231 tgtok::TokKind prepIsDirective() const;
232
233 // Given a preprocessing token kind, adjusts CurPtr to the end
234 // of the preprocessing directive word. Returns true, unless
235 // an unsupported token kind is passed in.
236 //
237 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
238 // to avoid adjusting CurPtr before we are sure that '#' is followed
239 // by a preprocessing directive. If it is not, then we fall back to
240 // tgtok::paste interpretation of '#'.
241 bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
242
243 // The main "exit" point from the token parsing to preprocessor.
244 //
245 // The method is called for CurPtr, when prepIsDirective() returns
246 // true. The first parameter matches the result of prepIsDirective(),
247 // denoting the actual preprocessor directive to be processed.
248 //
249 // If the preprocessing directive disables the tokens processing, e.g.:
250 // #ifdef NAME // NAME is undefined
251 // then lexPreprocessor() enters the lines-skipping mode.
252 // In this mode, it does not parse any tokens, because the code under
253 // the #ifdef may not even be a correct tablegen code. The preprocessor
254 // looks for lines containing other preprocessing directives, which
255 // may be prepended with whitespaces and C-style comments. If the line
256 // does not contain a preprocessing directive, it is skipped completely.
257 // Otherwise, the preprocessing directive is processed by recursively
258 // calling lexPreprocessor(). The processing of the encountered
259 // preprocessing directives includes updating preprocessing control stack
260 // and adding new macros into DefinedMacros set.
261 //
262 // The second parameter controls whether lexPreprocessor() is called from
263 // LexToken() (true) or recursively from lexPreprocessor() (false).
264 //
265 // If ReturnNextLiveToken is true, the method returns the next
266 // LEX token following the current directive or following the end
267 // of the disabled preprocessing region corresponding to this directive.
268 // If ReturnNextLiveToken is false, the method returns the first parameter,
269 // unless there were errors encountered in the disabled preprocessing
270 // region - in this case, it returns tgtok::Error.
271 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
272 bool ReturnNextLiveToken = true);
273
274 // Worker method for lexPreprocessor() to skip lines after some
275 // preprocessing directive up to the buffer end or to the directive
276 // that re-enables token processing. The method returns true
277 // upon processing the next directive that re-enables tokens
278 // processing. False is returned if an error was encountered.
279 //
280 // Note that prepSkipRegion() calls lexPreprocessor() to process
281 // encountered preprocessing directives. In this case, the second
282 // parameter to lexPreprocessor() is set to false. Being passed
283 // false ReturnNextLiveToken, lexPreprocessor() must never call
284 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
285 // to prepSkipRegion() and checking that it is never set to false.
286 bool prepSkipRegion(bool MustNeverBeFalse);
287
288 // Lex name of the macro after either #ifdef or #define. We could have used
289 // LexIdentifier(), but it has special handling of "include" word, which
290 // could result in awkward diagnostic errors. Consider:
291 // ----
292 // #ifdef include
293 // class ...
294 // ----
295 // LexIdentifier() will engage LexInclude(), which will complain about
296 // missing file with name "class". Instead, prepLexMacroName() will treat
297 // "include" as a normal macro name.
298 //
299 // On entry, CurPtr points to the end of a preprocessing directive word.
300 // The method allows for whitespaces between the preprocessing directive
301 // and the macro name. The allowed whitespaces are ' ' and '\t'.
302 //
303 // If the first non-whitespace symbol after the preprocessing directive
304 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
305 // the method updates TokStart to the position of the first non-whitespace
306 // symbol, sets CurPtr to the position of the macro name's last symbol,
307 // and returns a string reference to the macro name. Otherwise,
308 // TokStart is set to the first non-whitespace symbol after the preprocessing
309 // directive, and the method returns an empty string reference.
310 //
311 // In all cases, TokStart may be used to point to the word following
312 // the preprocessing directive.
313 StringRef prepLexMacroName();
314
315 // Skip any whitespaces starting from CurPtr. The method is used
316 // only in the lines-skipping mode to find the first non-whitespace
317 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
318 // and '\r'. The method skips C-style comments as well, because
319 // it is used to find the beginning of the preprocessing directive.
320 // If we do not handle C-style comments the following code would
321 // result in incorrect detection of a preprocessing directive:
322 // /*
323 // #ifdef NAME
324 // */
325 // As long as we skip C-style comments, the following code is correctly
326 // recognized as a preprocessing directive:
327 // /* first line comment
328 // second line comment */ #ifdef NAME
329 //
330 // The method returns true upon reaching the first non-whitespace symbol
331 // or EOF, CurPtr is set to point to this symbol. The method returns false,
332 // if an error occured during skipping of a C-style comment.
333 bool prepSkipLineBegin();
334
335 // Skip any whitespaces or comments after a preprocessing directive.
336 // The method returns true upon reaching either end of the line
337 // or end of the file. If there is a multiline C-style comment
338 // after the preprocessing directive, the method skips
339 // the comment, so the final CurPtr may point to one of the next lines.
340 // The method returns false, if an error occured during skipping
341 // C- or C++-style comment, or a non-whitespace symbol appears
342 // after the preprocessing directive.
343 //
344 // The method maybe called both during lines-skipping and tokens
345 // processing. It actually verifies that only whitespaces or/and
346 // comments follow a preprocessing directive.
347 //
348 // After the execution of this mehod, CurPtr points either to new line
349 // symbol, buffer end or non-whitespace symbol following the preprocesing
350 // directive.
351 bool prepSkipDirectiveEnd();
352
353 // Skip all symbols to the end of the line/file.
354 // The method adjusts CurPtr, so that it points to either new line
355 // symbol in the current line or the buffer end.
356 void prepSkipToLineEnd();
357
358 // Return true, if the current preprocessor control stack is such that
359 // we should allow lexer to process the next token, false - otherwise.
360 //
361 // In particular, the method returns true, if all the #ifdef/#else
362 // controls on the stack have their IsDefined member set to true.
363 bool prepIsProcessingEnabled();
364
365 // Report an error, if we reach EOF with non-empty preprocessing control
366 // stack. This means there is no matching #endif for the previous
367 // #ifdef/#else.
368 void prepReportPreprocessorStackError();
Chris Lattnerda4ab672007-11-18 02:57:27 +0000369};
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000370
Chris Lattnerda4ab672007-11-18 02:57:27 +0000371} // end namespace llvm
372
373#endif