//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class represents the Lexer for tablegen files.
//
//===----------------------------------------------------------------------===//
12
Benjamin Kramera7c40ef2014-08-13 16:26:38 +000013#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14#define LLVM_LIB_TABLEGEN_TGLEXER_H
Chris Lattnerda4ab672007-11-18 02:57:27 +000015
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
Chris Lattnerda4ab672007-11-18 02:57:27 +000025
26namespace llvm {
// Forward declarations for types this header only refers to by reference.
// SMLoc needs no forward declaration: it is used by value further down and
// its full definition comes from "llvm/Support/SMLoc.h", included above.
class SourceMgr;
class Twine;
30
namespace tgtok {
/// Token kinds used by the TableGen lexer.
///
/// Enumerator values are assigned implicitly starting at 0, so the order of
/// the enumerators below must not be changed.
enum TokKind {
  // Markers.
  Eof,
  Error,

  // Tokens that carry no additional information.
  minus,    // -
  plus,     // +
  l_square, // [
  r_square, // ]
  l_brace,  // {
  r_brace,  // }
  l_paren,  // (
  r_paren,  // )
  less,     // <
  greater,  // >
  colon,    // :
  semi,     // ;
  comma,    // ,
  period,   // .
  equal,    // =
  question, // ?
  paste,    // #

  // Keywords.
  Bit,
  Bits,
  Class,
  Code,
  Dag,
  Def,
  Foreach,
  Defm,
  Field,
  In,
  Int,
  Let,
  List,
  MultiClass,
  String,
  Defset,

  // !keywords.
  XConcat,
  XADD,
  XMUL,
  XAND,
  XOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListSplat,
  XStrConcat,
  XCast,
  XSubst,
  XForEach,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,

  // Integer value.
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.
  BinaryIntVal,

  // String valued tokens.
  Id,
  StrVal,
  VarName,
  CodeFragment,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define
};
} // end namespace tgtok
Chris Lattnerda4ab672007-11-18 02:57:27 +000071
Chris Lattnerf4127dd2007-11-22 20:49:04 +000072/// TGLexer - TableGen Lexer class.
Chris Lattnerda4ab672007-11-18 02:57:27 +000073class TGLexer {
Chris Lattnerfd255752009-06-21 03:41:50 +000074 SourceMgr &SrcMgr;
Nicolai Haehnle169ec092018-03-09 18:32:04 +000075
Chris Lattnerda4ab672007-11-18 02:57:27 +000076 const char *CurPtr;
Rafael Espindolaa3c65092014-07-06 14:24:03 +000077 StringRef CurBuf;
Chris Lattnerda4ab672007-11-18 02:57:27 +000078
Chris Lattnerf4127dd2007-11-22 20:49:04 +000079 // Information about the current token.
80 const char *TokStart;
81 tgtok::TokKind CurCode;
82 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
Dan Gohmanca0546f2008-10-17 01:33:43 +000083 int64_t CurIntVal; // This is valid for INTVAL.
Chris Lattner8db9bc72009-03-13 07:05:43 +000084
85 /// CurBuffer - This is the current buffer index we're lexing from as managed
86 /// by the SourceMgr object.
Alp Tokera55b95b2014-07-06 10:33:31 +000087 unsigned CurBuffer;
Sean Silva3b964242013-02-07 04:30:39 +000088
89public:
90 typedef std::map<std::string, SMLoc> DependenciesMapTy;
91private:
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +000092 /// Dependencies - This is the list of all included files.
Sean Silva3b964242013-02-07 04:30:39 +000093 DependenciesMapTy Dependencies;
94
Chris Lattnerda4ab672007-11-18 02:57:27 +000095public:
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +000096 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
Benjamin Kramerdd0ff852015-04-11 15:32:26 +000097
Chris Lattnerf4127dd2007-11-22 20:49:04 +000098 tgtok::TokKind Lex() {
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +000099 return CurCode = LexToken(CurPtr == CurBuf.begin());
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000100 }
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +0000101
Sean Silva3b964242013-02-07 04:30:39 +0000102 const DependenciesMapTy &getDependencies() const {
Joerg Sonnenbergeraf5f23e2011-06-01 13:10:15 +0000103 return Dependencies;
104 }
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000105
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000106 tgtok::TokKind getCode() const { return CurCode; }
107
108 const std::string &getCurStrVal() const {
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000109 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000110 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
111 "This token doesn't have a string value");
112 return CurStrVal;
113 }
Dan Gohmanca0546f2008-10-17 01:33:43 +0000114 int64_t getCurIntVal() const {
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000115 assert(CurCode == tgtok::IntVal && "This token isn't an integer");
116 return CurIntVal;
117 }
Pete Cooper25977642014-08-07 05:47:00 +0000118 std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
119 assert(CurCode == tgtok::BinaryIntVal &&
120 "This token isn't a binary integer");
121 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
122 }
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000123
Chris Lattner526c8cb2009-06-21 03:39:35 +0000124 SMLoc getLoc() const;
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000125
Chris Lattnerda4ab672007-11-18 02:57:27 +0000126private:
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000127 /// LexToken - Read the next token and return its code.
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000128 tgtok::TokKind LexToken(bool FileOrLineStart = false);
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000129
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000130 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
Benjamin Kramerc7583112010-09-27 17:42:11 +0000131 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000132
Chris Lattnerda4ab672007-11-18 02:57:27 +0000133 int getNextChar();
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000134 int peekNextChar(int Index) const;
Chris Lattnerda4ab672007-11-18 02:57:27 +0000135 void SkipBCPLComment();
136 bool SkipCComment();
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000137 tgtok::TokKind LexIdentifier();
Chris Lattnerda4ab672007-11-18 02:57:27 +0000138 bool LexInclude();
Chris Lattnerf4127dd2007-11-22 20:49:04 +0000139 tgtok::TokKind LexString();
140 tgtok::TokKind LexVarName();
141 tgtok::TokKind LexNumber();
142 tgtok::TokKind LexBracket();
143 tgtok::TokKind LexExclaim();
Vyacheslav Zakharinf7d079e2018-11-27 18:57:43 +0000144
145 // Process EOF encountered in LexToken().
146 // If EOF is met in an include file, then the method will update
147 // CurPtr, CurBuf and preprocessing include stack, and return true.
148 // If EOF is met in the top-level file, then the method will
149 // update and check the preprocessing include stack, and return false.
150 bool processEOF();
151
152 // *** Structures and methods for preprocessing support ***
153
154 // A set of macro names that are defined either via command line or
155 // by using:
156 // #define NAME
157 StringSet<> DefinedMacros;
158
159 // Each of #ifdef and #else directives has a descriptor associated
160 // with it.
161 //
162 // An ordered list of preprocessing controls defined by #ifdef/#else
163 // directives that are in effect currently is called preprocessing
164 // control stack. It is represented as a vector of PreprocessorControlDesc's.
165 //
166 // The control stack is updated according to the following rules:
167 //
168 // For each #ifdef we add an element to the control stack.
169 // For each #else we replace the top element with a descriptor
170 // with an inverted IsDefined value.
171 // For each #endif we pop the top element from the control stack.
172 //
173 // When CurPtr reaches the current buffer's end, the control stack
174 // must be empty, i.e. #ifdef and the corresponding #endif
175 // must be located in the same file.
176 struct PreprocessorControlDesc {
177 // Either tgtok::Ifdef or tgtok::Else.
178 tgtok::TokKind Kind;
179
180 // True, if the condition for this directive is true, false - otherwise.
181 // Examples:
182 // #ifdef NAME : true, if NAME is defined, false - otherwise.
183 // ...
184 // #else : false, if NAME is defined, true - otherwise.
185 bool IsDefined;
186
187 // Pointer into CurBuf to the beginning of the preprocessing directive
188 // word, e.g.:
189 // #ifdef NAME
190 // ^ - SrcPos
191 SMLoc SrcPos;
192 };
193
194 // We want to disallow code like this:
195 // file1.td:
196 // #define NAME
197 // #ifdef NAME
198 // include "file2.td"
199 // EOF
200 // file2.td:
201 // #endif
202 // EOF
203 //
204 // To do this, we clear the preprocessing control stack on entry
205 // to each of the included file. PrepIncludeStack is used to store
206 // preprocessing control stacks for the current file and all its
207 // parent files. The back() element is the preprocessing control
208 // stack for the current file.
209 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
210 PrepIncludeStack;
211
212 // Validate that the current preprocessing control stack is empty,
213 // since we are about to exit a file, and pop the include stack.
214 //
215 // If IncludeStackMustBeEmpty is true, the include stack must be empty
216 // after the popping, otherwise, the include stack must not be empty
217 // after the popping. Basically, the include stack must be empty
218 // only if we exit the "top-level" file (i.e. finish lexing).
219 //
220 // The method returns false, if the current preprocessing control stack
221 // is not empty (e.g. there is an unterminated #ifdef/#else),
222 // true - otherwise.
223 bool prepExitInclude(bool IncludeStackMustBeEmpty);
224
225 // Look ahead for a preprocessing directive starting from CurPtr. The caller
226 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
227 // a preprocessing directive word followed by a whitespace, then it returns
228 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
229 //
230 // CurPtr is not adjusted by this method.
231 tgtok::TokKind prepIsDirective() const;
232
233 // Given a preprocessing token kind, adjusts CurPtr to the end
234 // of the preprocessing directive word. Returns true, unless
235 // an unsupported token kind is passed in.
236 //
237 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
238 // to avoid adjusting CurPtr before we are sure that '#' is followed
239 // by a preprocessing directive. If it is not, then we fall back to
240 // tgtok::paste interpretation of '#'.
241 bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
242
243 // The main "exit" point from the token parsing to preprocessor.
244 //
245 // The method is called for CurPtr, when prepIsDirective() returns
246 // true. The first parameter matches the result of prepIsDirective(),
247 // denoting the actual preprocessor directive to be processed.
248 //
249 // If the preprocessing directive disables the tokens processing, e.g.:
250 // #ifdef NAME // NAME is undefined
251 // then lexPreprocessor() enters the lines-skipping mode.
252 // In this mode, it does not parse any tokens, because the code under
253 // the #ifdef may not even be a correct tablegen code. The preprocessor
254 // looks for lines containing other preprocessing directives, which
255 // may be prepended with whitespaces and C-style comments. If the line
256 // does not contain a preprocessing directive, it is skipped completely.
257 // Otherwise, the preprocessing directive is processed by recursively
258 // calling lexPreprocessor(). The processing of the encountered
259 // preprocessing directives includes updating preprocessing control stack
260 // and adding new macros into DefinedMacros set.
261 //
262 // The second parameter controls whether lexPreprocessor() is called from
263 // LexToken() (true) or recursively from lexPreprocessor() (false).
264 //
265 // If ReturnNextLiveToken is true, the method returns the next
266 // LEX token following the current directive or following the end
267 // of the disabled preprocessing region corresponding to this directive.
268 // If ReturnNextLiveToken is false, the method returns the first parameter,
269 // unless there were errors encountered in the disabled preprocessing
270 // region - in this case, it returns tgtok::Error.
271 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
272 bool ReturnNextLiveToken = true);
273
274 // Worker method for lexPreprocessor() to skip lines after some
275 // preprocessing directive up to the buffer end or to the directive
276 // that re-enables token processing. The method returns true
277 // upon processing the next directive that re-enables tokens
278 // processing. False is returned if an error was encountered.
279 //
280 // Note that prepSkipRegion() calls lexPreprocessor() to process
281 // encountered preprocessing directives. In this case, the second
282 // parameter to lexPreprocessor() is set to false. Being passed
283 // false ReturnNextLiveToken, lexPreprocessor() must never call
284 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
285 // to prepSkipRegion() and checking that it is never set to false.
286 bool prepSkipRegion(bool MustNeverBeFalse);
287
288 // Lex name of the macro after either #ifdef or #define. We could have used
289 // LexIdentifier(), but it has special handling of "include" word, which
290 // could result in awkward diagnostic errors. Consider:
291 // ----
292 // #ifdef include
293 // class ...
294 // ----
295 // LexIdentifier() will engage LexInclude(), which will complain about
296 // missing file with name "class". Instead, prepLexMacroName() will treat
297 // "include" as a normal macro name.
298 //
299 // On entry, CurPtr points to the end of a preprocessing directive word.
300 // The method allows for whitespaces between the preprocessing directive
301 // and the macro name. The allowed whitespaces are ' ' and '\t'.
302 //
303 // If the first non-whitespace symbol after the preprocessing directive
304 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
305 // the method updates TokStart to the position of the first non-whitespace
306 // symbol, sets CurPtr to the position of the macro name's last symbol,
307 // and returns a string reference to the macro name. Otherwise,
308 // TokStart is set to the first non-whitespace symbol after the preprocessing
309 // directive, and the method returns an empty string reference.
310 //
311 // In all cases, TokStart may be used to point to the word following
312 // the preprocessing directive.
313 StringRef prepLexMacroName();
314
315 // Skip any whitespaces starting from CurPtr. The method is used
316 // only in the lines-skipping mode to find the first non-whitespace
317 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
318 // and '\r'. The method skips C-style comments as well, because
319 // it is used to find the beginning of the preprocessing directive.
320 // If we do not handle C-style comments the following code would
321 // result in incorrect detection of a preprocessing directive:
322 // /*
323 // #ifdef NAME
324 // */
325 // As long as we skip C-style comments, the following code is correctly
326 // recognized as a preprocessing directive:
327 // /* first line comment
328 // second line comment */ #ifdef NAME
329 //
330 // The method returns true upon reaching the first non-whitespace symbol
331 // or EOF, CurPtr is set to point to this symbol. The method returns false,
332 // if an error occured during skipping of a C-style comment.
333 bool prepSkipLineBegin();
334
335 // Skip any whitespaces or comments after a preprocessing directive.
336 // The method returns true upon reaching either end of the line
337 // or end of the file. If there is a multiline C-style comment
338 // after the preprocessing directive, the method skips
339 // the comment, so the final CurPtr may point to one of the next lines.
340 // The method returns false, if an error occured during skipping
341 // C- or C++-style comment, or a non-whitespace symbol appears
342 // after the preprocessing directive.
343 //
344 // The method maybe called both during lines-skipping and tokens
345 // processing. It actually verifies that only whitespaces or/and
346 // comments follow a preprocessing directive.
347 //
348 // After the execution of this mehod, CurPtr points either to new line
349 // symbol, buffer end or non-whitespace symbol following the preprocesing
350 // directive.
351 bool prepSkipDirectiveEnd();
352
353 // Skip all symbols to the end of the line/file.
354 // The method adjusts CurPtr, so that it points to either new line
355 // symbol in the current line or the buffer end.
356 void prepSkipToLineEnd();
357
358 // Return true, if the current preprocessor control stack is such that
359 // we should allow lexer to process the next token, false - otherwise.
360 //
361 // In particular, the method returns true, if all the #ifdef/#else
362 // controls on the stack have their IsDefined member set to true.
363 bool prepIsProcessingEnabled();
364
365 // Report an error, if we reach EOF with non-empty preprocessing control
366 // stack. This means there is no matching #endif for the previous
367 // #ifdef/#else.
368 void prepReportPreprocessorStackError();
Chris Lattnerda4ab672007-11-18 02:57:27 +0000369};
Nicolai Haehnle169ec092018-03-09 18:32:04 +0000370
Chris Lattnerda4ab672007-11-18 02:57:27 +0000371} // end namespace llvm
372
373#endif