//===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief This file contains the declaration of the UnwrappedLineParser,
/// which turns a stream of tokens into UnwrappedLines.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H
#define LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H

#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/OperatorPrecedence.h"
#include "clang/Format/Format.h"
#include "clang/Lex/Lexer.h"
#include <list>

namespace clang {
namespace format {

/// \brief Classification attached to a \c FormatToken (see
/// \c FormatToken::Type).
///
/// \c TT_Unknown is the value a default-constructed \c FormatToken carries.
/// The enumerators have no explicit values, so their numeric values follow
/// declaration order; do not reorder them if anything depends on those values.
enum TokenType {
  TT_BinaryOperator,
  TT_BlockComment,
  TT_CastRParen,
  TT_ConditionalExpr,
  TT_CtorInitializerColon,
  TT_DesignatedInitializerPeriod,
  TT_ImplicitStringLiteral,
  TT_InlineASMColon,
  TT_InheritanceColon,
  TT_FunctionTypeLParen,
  TT_LineComment,
  TT_ObjCArrayLiteral,
  TT_ObjCBlockLParen,
  TT_ObjCDecl,
  TT_ObjCDictLiteral,
  TT_ObjCForIn,
  TT_ObjCMethodExpr,
  TT_ObjCMethodSpecifier,
  TT_ObjCProperty,
  TT_ObjCSelectorName,
  TT_OverloadedOperator,
  TT_OverloadedOperatorLParen,
  TT_PointerOrReference,
  TT_PureVirtualSpecifier,
  TT_RangeBasedForLoopColon,
  TT_StartOfName,
  TT_TemplateCloser,
  TT_TemplateOpener,
  TT_TrailingUnaryOperator,
  TT_UnaryOperator,
  TT_Unknown
};

Daniel Jasperbac016b2012-12-03 18:12:45 +000062/// \brief A wrapper around a \c Token storing information about the
63/// whitespace characters preceeding it.
64struct FormatToken {
Manuel Klimeka080a182013-01-02 16:30:12 +000065 FormatToken()
Manuel Klimekde008c02013-05-27 15:23:34 +000066 : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
Manuel Klimekb3987012013-05-29 14:47:47 +000067 TokenLength(0), IsFirst(false), MustBreakBefore(false),
68 Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false),
69 ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0),
70 UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
71 LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
72 PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
73 Next(NULL) {}
Daniel Jasperbac016b2012-12-03 18:12:45 +000074
75 /// \brief The \c Token.
76 Token Tok;
77
78 /// \brief The number of newlines immediately before the \c Token.
79 ///
80 /// This can be used to determine what the user wrote in the original code
81 /// and thereby e.g. leave an empty line between two function definitions.
82 unsigned NewlinesBefore;
83
Manuel Klimeka080a182013-01-02 16:30:12 +000084 /// \brief Whether there is at least one unescaped newline before the \c
85 /// Token.
86 bool HasUnescapedNewline;
87
Manuel Klimekad3094b2013-05-23 10:56:37 +000088 /// \brief The range of the whitespace immediately preceeding the \c Token.
89 SourceRange WhitespaceRange;
Manuel Klimekf6fd00b2013-01-05 22:56:06 +000090
Daniel Jasper1eee6c42013-03-04 13:43:19 +000091 /// \brief The offset just past the last '\n' in this token's leading
92 /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'.
93 unsigned LastNewlineOffset;
94
Manuel Klimek95419382013-01-07 07:56:50 +000095 /// \brief The length of the non-whitespace parts of the token. This is
96 /// necessary because we need to handle escaped newlines that are stored
97 /// with the token.
98 unsigned TokenLength;
99
Manuel Klimekf6fd00b2013-01-05 22:56:06 +0000100 /// \brief Indicates that this is the first token.
101 bool IsFirst;
Daniel Jasper26f7e782013-01-08 14:56:18 +0000102
Manuel Klimek526ed112013-01-09 15:25:02 +0000103 /// \brief Whether there must be a line break before this token.
104 ///
105 /// This happens for example when a preprocessor directive ended directly
106 /// before the token.
107 bool MustBreakBefore;
Alexander Kornienko919398b2013-04-17 17:34:05 +0000108
Alexander Kornienko919398b2013-04-17 17:34:05 +0000109 /// \brief Returns actual token start location without leading escaped
110 /// newlines and whitespace.
111 ///
112 /// This can be different to Tok.getLocation(), which includes leading escaped
113 /// newlines.
114 SourceLocation getStartOfNonWhitespace() const {
Manuel Klimekad3094b2013-05-23 10:56:37 +0000115 return WhitespaceRange.getEnd();
Alexander Kornienko919398b2013-04-17 17:34:05 +0000116 }
Manuel Klimekde008c02013-05-27 15:23:34 +0000117
118 /// \brief The raw text of the token.
119 ///
120 /// Contains the raw token text without leading whitespace and without leading
121 /// escaped newlines.
122 StringRef TokenText;
Manuel Klimekdcb3f2a2013-05-28 13:42:28 +0000123
Manuel Klimekb3987012013-05-29 14:47:47 +0000124 TokenType Type;
125
126 unsigned SpacesRequiredBefore;
127 bool CanBreakBefore;
128
129 bool ClosesTemplateDeclaration;
130
131 /// \brief Number of parameters, if this is "(", "[" or "<".
132 ///
133 /// This is initialized to 1 as we don't need to distinguish functions with
134 /// 0 parameters from functions with 1 parameter. Thus, we can simply count
135 /// the number of commas.
136 unsigned ParameterCount;
137
138 /// \brief The total length of the line up to and including this token.
139 unsigned TotalLength;
140
141 /// \brief The length of following tokens until the next natural split point,
142 /// or the next token that can be broken.
143 unsigned UnbreakableTailLength;
144
145 // FIXME: Come up with a 'cleaner' concept.
146 /// \brief The binding strength of a token. This is a combined value of
147 /// operator precedence, parenthesis nesting, etc.
148 unsigned BindingStrength;
149
150 /// \brief Penalty for inserting a line break before this token.
151 unsigned SplitPenalty;
152
153 /// \brief If this is the first ObjC selector name in an ObjC method
154 /// definition or call, this contains the length of the longest name.
155 unsigned LongestObjCSelectorName;
156
157 /// \brief Stores the number of required fake parentheses and the
158 /// corresponding operator precedence.
159 ///
160 /// If multiple fake parentheses start at a token, this vector stores them in
161 /// reverse order, i.e. inner fake parenthesis first.
162 SmallVector<prec::Level, 4> FakeLParens;
163 /// \brief Insert this many fake ) after this token for correct indentation.
164 unsigned FakeRParens;
165
166 /// \brief Is this the last "." or "->" in a builder-type call?
167 bool LastInChainOfCalls;
168
169 /// \brief Is this token part of a \c DeclStmt defining multiple variables?
170 ///
171 /// Only set if \c Type == \c TT_StartOfName.
172 bool PartOfMultiVariableDeclStmt;
173
174 bool is(tok::TokenKind Kind) const { return Tok.is(Kind); }
175
176 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const {
177 return is(K1) || is(K2);
178 }
179
180 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, tok::TokenKind K3) const {
181 return is(K1) || is(K2) || is(K3);
182 }
183
184 bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, tok::TokenKind K3,
185 tok::TokenKind K4, tok::TokenKind K5 = tok::NUM_TOKENS,
186 tok::TokenKind K6 = tok::NUM_TOKENS,
187 tok::TokenKind K7 = tok::NUM_TOKENS,
188 tok::TokenKind K8 = tok::NUM_TOKENS,
189 tok::TokenKind K9 = tok::NUM_TOKENS,
190 tok::TokenKind K10 = tok::NUM_TOKENS,
191 tok::TokenKind K11 = tok::NUM_TOKENS,
192 tok::TokenKind K12 = tok::NUM_TOKENS) const {
193 return is(K1) || is(K2) || is(K3) || is(K4) || is(K5) || is(K6) || is(K7) ||
194 is(K8) || is(K9) || is(K10) || is(K11) || is(K12);
195 }
196
197 bool isNot(tok::TokenKind Kind) const { return Tok.isNot(Kind); }
198
199 bool isObjCAtKeyword(tok::ObjCKeywordKind Kind) const {
200 return Tok.isObjCAtKeyword(Kind);
201 }
202
203 bool isAccessSpecifier(bool ColonRequired = true) const {
204 return isOneOf(tok::kw_public, tok::kw_protected, tok::kw_private) &&
205 (!ColonRequired || (Next && Next->is(tok::colon)));
206 }
207
208 bool isObjCAccessSpecifier() const {
209 return is(tok::at) && Next && (Next->isObjCAtKeyword(tok::objc_public) ||
210 Next->isObjCAtKeyword(tok::objc_protected) ||
211 Next->isObjCAtKeyword(tok::objc_package) ||
212 Next->isObjCAtKeyword(tok::objc_private));
213 }
214
215 /// \brief Returns whether \p Tok is ([{ or a template opening <.
216 bool opensScope() const {
217 return isOneOf(tok::l_paren, tok::l_brace, tok::l_square) ||
218 Type == TT_TemplateOpener;
219
220 }
221 /// \brief Returns whether \p Tok is )]} or a template opening >.
222 bool closesScope() const {
223 return isOneOf(tok::r_paren, tok::r_brace, tok::r_square) ||
224 Type == TT_TemplateCloser;
225 }
226
227 bool isUnaryOperator() const {
228 switch (Tok.getKind()) {
229 case tok::plus:
230 case tok::plusplus:
231 case tok::minus:
232 case tok::minusminus:
233 case tok::exclaim:
234 case tok::tilde:
235 case tok::kw_sizeof:
236 case tok::kw_alignof:
237 return true;
238 default:
239 return false;
240 }
241 }
242 bool isBinaryOperator() const {
243 // Comma is a binary operator, but does not behave as such wrt. formatting.
244 return getPrecedence() > prec::Comma;
245 }
246 bool isTrailingComment() const {
247 return is(tok::comment) && (!Next || Next->NewlinesBefore > 0);
248 }
249
250 prec::Level getPrecedence() const {
251 return getBinOpPrecedence(Tok.getKind(), true, true);
252 }
253
254 /// \brief Returns the previous token ignoring comments.
255 FormatToken *getPreviousNoneComment() const {
256 FormatToken *Tok = Previous;
257 while (Tok != NULL && Tok->is(tok::comment))
258 Tok = Tok->Previous;
259 return Tok;
260 }
261
262 /// \brief Returns the next token ignoring comments.
263 const FormatToken *getNextNoneComment() const {
264 const FormatToken *Tok = Next;
265 while (Tok != NULL && Tok->is(tok::comment))
266 Tok = Tok->Next;
267 return Tok;
268 }
269
270 FormatToken *MatchingParen;
271
272 FormatToken *Previous;
273 FormatToken *Next;
274
Manuel Klimekdcb3f2a2013-05-28 13:42:28 +0000275private:
276 // Disallow copying.
277 FormatToken(const FormatToken &);
278 void operator=(const FormatToken &);
Daniel Jasperbac016b2012-12-03 18:12:45 +0000279};
280
281/// \brief An unwrapped line is a sequence of \c Token, that we would like to
282/// put on a single line if there was no column limit.
283///
284/// This is used as a main interface between the \c UnwrappedLineParser and the
285/// \c UnwrappedLineFormatter. The key property is that changing the formatting
286/// within an unwrapped line does not affect any other unwrapped lines.
287struct UnwrappedLine {
Manuel Klimek70b03f42013-01-23 09:32:48 +0000288 UnwrappedLine() : Level(0), InPPDirective(false), MustBeDeclaration(false) {
Daniel Jasperbac016b2012-12-03 18:12:45 +0000289 }
290
Daniel Jasper3f8cdbf2013-01-16 10:41:46 +0000291 // FIXME: Don't use std::list here.
Daniel Jaspercbb6c412013-01-16 09:10:19 +0000292 /// \brief The \c Tokens comprising this \c UnwrappedLine.
Manuel Klimekdcb3f2a2013-05-28 13:42:28 +0000293 std::list<FormatToken *> Tokens;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000294
295 /// \brief The indent level of the \c UnwrappedLine.
296 unsigned Level;
Manuel Klimeka080a182013-01-02 16:30:12 +0000297
298 /// \brief Whether this \c UnwrappedLine is part of a preprocessor directive.
299 bool InPPDirective;
Manuel Klimek70b03f42013-01-23 09:32:48 +0000300
301 bool MustBeDeclaration;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000302};
303
304class UnwrappedLineConsumer {
305public:
Daniel Jasperaccb0b02012-12-04 21:05:31 +0000306 virtual ~UnwrappedLineConsumer() {
307 }
Alexander Kornienko720ffb62012-12-05 13:56:52 +0000308 virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000309};
310
Manuel Klimek96e888b2013-05-28 11:55:06 +0000311class FormatTokenSource;
Alexander Kornienko469a21b2012-12-07 16:15:44 +0000312
Daniel Jasperbac016b2012-12-03 18:12:45 +0000313class UnwrappedLineParser {
314public:
Manuel Klimek96e888b2013-05-28 11:55:06 +0000315 UnwrappedLineParser(const FormatStyle &Style, ArrayRef<FormatToken *> Tokens,
Daniel Jasperbac016b2012-12-03 18:12:45 +0000316 UnwrappedLineConsumer &Callback);
317
Alexander Kornienkocff563c2012-12-04 17:27:50 +0000318 /// Returns true in case of a structural error.
319 bool parse();
Daniel Jasperbac016b2012-12-03 18:12:45 +0000320
321private:
Manuel Klimek67d080d2013-04-12 14:13:36 +0000322 void parseFile();
323 void parseLevel(bool HasOpeningBrace);
324 void parseBlock(bool MustBeDeclaration, unsigned AddLevels = 1);
Daniel Jasperbac016b2012-12-03 18:12:45 +0000325 void parsePPDirective();
Manuel Klimekd4397b92013-01-04 23:34:14 +0000326 void parsePPDefine();
Alexander Kornienko6fb46b02013-05-24 18:24:24 +0000327 void parsePPIf();
328 void parsePPIfdef();
329 void parsePPElIf();
330 void parsePPElse();
331 void parsePPEndIf();
Manuel Klimekd4397b92013-01-04 23:34:14 +0000332 void parsePPUnknown();
Manuel Klimekf0ab0a32013-01-07 14:56:16 +0000333 void parseStructuralElement();
Manuel Klimek80829bd2013-05-23 09:41:43 +0000334 bool tryToParseBracedList();
Manuel Klimekbb42bf12013-01-10 11:52:21 +0000335 void parseBracedList();
Manuel Klimekc44ee892013-01-21 10:07:49 +0000336 void parseReturn();
Daniel Jasperbac016b2012-12-03 18:12:45 +0000337 void parseParens();
338 void parseIfThenElse();
Alexander Kornienko2e97cfc2012-12-05 15:06:06 +0000339 void parseForOrWhileLoop();
Daniel Jasperbac016b2012-12-03 18:12:45 +0000340 void parseDoWhile();
341 void parseLabel();
342 void parseCaseLabel();
343 void parseSwitch();
Alexander Kornienko15757312012-12-06 18:03:27 +0000344 void parseNamespace();
Daniel Jasperbac016b2012-12-03 18:12:45 +0000345 void parseAccessSpecifier();
346 void parseEnum();
Manuel Klimek47ea7f62013-01-15 13:38:33 +0000347 void parseRecord();
Nico Weber1abe6ea2013-01-09 21:15:03 +0000348 void parseObjCProtocolList();
349 void parseObjCUntilAtEnd();
Nico Weber50767d82013-01-09 23:25:37 +0000350 void parseObjCInterfaceOrImplementation();
Nico Weber1abe6ea2013-01-09 21:15:03 +0000351 void parseObjCProtocol();
Daniel Jasperbac016b2012-12-03 18:12:45 +0000352 void addUnwrappedLine();
353 bool eof() const;
354 void nextToken();
Manuel Klimekd4397b92013-01-04 23:34:14 +0000355 void readToken();
Manuel Klimek86721d22013-01-22 16:31:55 +0000356 void flushComments(bool NewlineBeforeNext);
Manuel Klimek96e888b2013-05-28 11:55:06 +0000357 void pushToken(FormatToken *Tok);
Manuel Klimek80829bd2013-05-23 09:41:43 +0000358 void calculateBraceTypes();
Alexander Kornienko6fb46b02013-05-24 18:24:24 +0000359 void pushPPConditional();
Manuel Klimek80829bd2013-05-23 09:41:43 +0000360
361 // Represents what type of block a left brace opens.
362 enum LBraceState {
363 BS_Unknown,
364 BS_Block,
365 BS_BracedInit
366 };
Daniel Jasperbac016b2012-12-03 18:12:45 +0000367
Manuel Klimekc37b4d62013-01-05 22:14:16 +0000368 // FIXME: We are constantly running into bugs where Line.Level is incorrectly
369 // subtracted from beyond 0. Introduce a method to subtract from Line.Level
370 // and use that everywhere in the Parser.
Dmitri Gribenkocfa88f82013-01-12 19:30:44 +0000371 OwningPtr<UnwrappedLine> Line;
Manuel Klimek86721d22013-01-22 16:31:55 +0000372
373 // Comments are sorted into unwrapped lines by whether they are in the same
374 // line as the previous token, or not. If not, they belong to the next token.
375 // Since the next token might already be in a new unwrapped line, we need to
376 // store the comments belonging to that token.
Manuel Klimek96e888b2013-05-28 11:55:06 +0000377 SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
378 FormatToken *FormatTok;
Manuel Klimek526ed112013-01-09 15:25:02 +0000379 bool MustBreakBeforeNextToken;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000380
Manuel Klimekba287dc2013-01-18 18:24:28 +0000381 // The parsed lines. Only added to through \c CurrentLines.
Manuel Klimek525fe162013-01-18 14:04:34 +0000382 std::vector<UnwrappedLine> Lines;
383
384 // Preprocessor directives are parsed out-of-order from other unwrapped lines.
385 // Thus, we need to keep a list of preprocessor directives to be reported
386 // after an unwarpped line that has been started was finished.
387 std::vector<UnwrappedLine> PreprocessorDirectives;
388
389 // New unwrapped lines are added via CurrentLines.
390 // Usually points to \c &Lines. While parsing a preprocessor directive when
391 // there is an unfinished previous unwrapped line, will point to
392 // \c &PreprocessorDirectives.
393 std::vector<UnwrappedLine> *CurrentLines;
394
Manuel Klimek70b03f42013-01-23 09:32:48 +0000395 // We store for each line whether it must be a declaration depending on
396 // whether we are in a compound statement or not.
397 std::vector<bool> DeclarationScopeStack;
398
Manuel Klimek67d080d2013-04-12 14:13:36 +0000399 // Will be true if we encounter an error that leads to possibily incorrect
400 // indentation levels.
401 bool StructuralError;
402
Alexander Kornienko15757312012-12-06 18:03:27 +0000403 const FormatStyle &Style;
Manuel Klimekd4397b92013-01-04 23:34:14 +0000404 FormatTokenSource *Tokens;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000405 UnwrappedLineConsumer &Callback;
Manuel Klimekbb42bf12013-01-10 11:52:21 +0000406
Manuel Klimek80829bd2013-05-23 09:41:43 +0000407 // FIXME: This is a temporary measure until we have reworked the ownership
408 // of the format tokens. The goal is to have the actual tokens created and
409 // owned outside of and handed into the UnwrappedLineParser.
Manuel Klimek96e888b2013-05-28 11:55:06 +0000410 ArrayRef<FormatToken *> AllTokens;
Manuel Klimek80829bd2013-05-23 09:41:43 +0000411
412 // FIXME: Currently we cannot store attributes with tokens, as we treat
413 // them as read-only; thus, we now store the brace state indexed by the
414 // position of the token in the stream (see \c AllTokens).
415 SmallVector<LBraceState, 16> LBraces;
416
Alexander Kornienko6fb46b02013-05-24 18:24:24 +0000417 // Represents preprocessor branch type, so we can find matching
418 // #if/#else/#endif directives.
419 enum PPBranchKind {
420 PP_Conditional, // Any #if, #ifdef, #ifndef, #elif, block outside #if 0
421 PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0
422 };
423
424 // Keeps a stack of currently active preprocessor branching directives.
425 SmallVector<PPBranchKind, 16> PPStack;
426
Manuel Klimekbb42bf12013-01-10 11:52:21 +0000427 friend class ScopedLineState;
Daniel Jasperbac016b2012-12-03 18:12:45 +0000428};
429
} // end namespace format
} // end namespace clang

#endif // LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H