blob: c12daa7f20e9ec842bb870f611f090db78492fcf [file] [log] [blame]
Martin Probstc4a0dd42016-05-20 11:24:24 +00001//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Martin Probstc4a0dd42016-05-20 11:24:24 +00006//
7//===----------------------------------------------------------------------===//
8///
9/// \file
Adrian Prantl9fc8faf2018-05-09 01:00:01 +000010/// This file implements FormatTokenLexer, which tokenizes a source file
Martin Probstc4a0dd42016-05-20 11:24:24 +000011/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/SourceManager.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
Krasimir Georgiev9ad83fe2017-10-30 14:01:50 +000026 unsigned Column, const FormatStyle &Style,
Martin Probstc4a0dd42016-05-20 11:24:24 +000027 encoding::Encoding Encoding)
Martin Probst6181da42016-08-25 10:13:21 +000028 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
Krasimir Georgiev9ad83fe2017-10-30 14:01:50 +000029 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
Martin Probst6181da42016-08-25 10:13:21 +000030 Style(Style), IdentTable(getFormattingLangOpts(Style)),
31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
Martin Probstc4a0dd42016-05-20 11:24:24 +000033 MacroBlockEndRegex(Style.MacroBlockEnd) {
34 Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35 getFormattingLangOpts(Style)));
36 Lex->SetKeepWhitespaceMode(true);
37
38 for (const std::string &ForEachMacro : Style.ForEachMacros)
Francois Ferrand6f40e212018-10-02 16:37:51 +000039 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40 for (const std::string &StatementMacro : Style.StatementMacros)
41 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
Martin Probstc4a0dd42016-05-20 11:24:24 +000042}
43
44ArrayRef<FormatToken *> FormatTokenLexer::lex() {
45 assert(Tokens.empty());
46 assert(FirstInLineIndex == 0);
47 do {
48 Tokens.push_back(getNextToken());
49 if (Style.Language == FormatStyle::LK_JavaScript) {
50 tryParseJSRegexLiteral();
Martin Probst6181da42016-08-25 10:13:21 +000051 handleTemplateStrings();
Martin Probstc4a0dd42016-05-20 11:24:24 +000052 }
Krasimir Georgiev410ed242017-11-10 12:50:09 +000053 if (Style.Language == FormatStyle::LK_TextProto)
54 tryParsePythonComment();
Martin Probstc4a0dd42016-05-20 11:24:24 +000055 tryMergePreviousTokens();
56 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
57 FirstInLineIndex = Tokens.size() - 1;
58 } while (Tokens.back()->Tok.isNot(tok::eof));
59 return Tokens;
60}
61
62void FormatTokenLexer::tryMergePreviousTokens() {
63 if (tryMerge_TMacro())
64 return;
65 if (tryMergeConflictMarkers())
66 return;
67 if (tryMergeLessLess())
68 return;
Paul Hoadcbb726d2019-03-21 13:09:22 +000069
70 if (Style.isCSharp()) {
71 if (tryMergeCSharpKeywordVariables())
72 return;
73 if (tryMergeCSharpVerbatimStringLiteral())
74 return;
75 if (tryMergeCSharpDoubleQuestion())
76 return;
77 if (tryMergeCSharpNullConditionals())
78 return;
79 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
80 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
81 return;
82 }
83
Alexander Kornienkod4fa2e62017-04-11 09:55:00 +000084 if (tryMergeNSStringLiteral())
85 return;
Martin Probstc4a0dd42016-05-20 11:24:24 +000086
87 if (Style.Language == FormatStyle::LK_JavaScript) {
88 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
89 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
90 tok::equal};
91 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
92 tok::greaterequal};
93 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
Martin Probst4ef03702017-05-04 15:04:04 +000094 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
95 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
96 tok::starequal};
97
Martin Probstc4a0dd42016-05-20 11:24:24 +000098 // FIXME: Investigate what token type gives the correct operator priority.
99 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
100 return;
101 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
102 return;
103 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
104 return;
105 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
106 return;
Martin Probst4ef03702017-05-04 15:04:04 +0000107 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
108 return;
109 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
110 Tokens.back()->Tok.setKind(tok::starequal);
111 return;
112 }
Martin Probst26a484f42019-03-19 12:28:41 +0000113 if (tryMergeJSPrivateIdentifier())
114 return;
Martin Probstc4a0dd42016-05-20 11:24:24 +0000115 }
Nico Weber48c94a62017-04-11 15:50:04 +0000116
117 if (Style.Language == FormatStyle::LK_Java) {
Manuel Klimek89628f62017-09-20 09:51:03 +0000118 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
119 tok::greater, tok::greater, tok::greaterequal};
Nico Weber48c94a62017-04-11 15:50:04 +0000120 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
121 return;
122 }
Martin Probstc4a0dd42016-05-20 11:24:24 +0000123}
124
Alexander Kornienkod4fa2e62017-04-11 09:55:00 +0000125bool FormatTokenLexer::tryMergeNSStringLiteral() {
126 if (Tokens.size() < 2)
127 return false;
128 auto &At = *(Tokens.end() - 2);
129 auto &String = *(Tokens.end() - 1);
130 if (!At->is(tok::at) || !String->is(tok::string_literal))
131 return false;
132 At->Tok.setKind(tok::string_literal);
133 At->TokenText = StringRef(At->TokenText.begin(),
134 String->TokenText.end() - At->TokenText.begin());
135 At->ColumnWidth += String->ColumnWidth;
136 At->Type = TT_ObjCStringLiteral;
137 Tokens.erase(Tokens.end() - 1);
138 return true;
139}
140
Martin Probst26a484f42019-03-19 12:28:41 +0000141bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
142 // Merges #idenfier into a single identifier with the text #identifier
143 // but the token tok::identifier.
144 if (Tokens.size() < 2)
145 return false;
146 auto &Hash = *(Tokens.end() - 2);
147 auto &Identifier = *(Tokens.end() - 1);
148 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
149 return false;
150 Hash->Tok.setKind(tok::identifier);
151 Hash->TokenText =
152 StringRef(Hash->TokenText.begin(),
153 Identifier->TokenText.end() - Hash->TokenText.begin());
154 Hash->ColumnWidth += Identifier->ColumnWidth;
155 Hash->Type = TT_JsPrivateIdentifier;
156 Tokens.erase(Tokens.end() - 1);
157 return true;
158}
159
Paul Hoadcbb726d2019-03-21 13:09:22 +0000160// Search for verbatim or interpolated string literals @"ABC" or
161// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
162// prevent splitting of @, $ and ".
163bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
164 if (Tokens.size() < 2)
165 return false;
166 auto &At = *(Tokens.end() - 2);
167 auto &String = *(Tokens.end() - 1);
168
169 // Look for $"aaaaaa" @"aaaaaa".
170 if (!(At->is(tok::at) || At->TokenText == "$") ||
171 !String->is(tok::string_literal))
172 return false;
173
174 if (Tokens.size() >= 2 && At->is(tok::at)) {
175 auto &Dollar = *(Tokens.end() - 3);
176 if (Dollar->TokenText == "$") {
177 // This looks like $@"aaaaa" so we need to combine all 3 tokens.
178 Dollar->Tok.setKind(tok::string_literal);
179 Dollar->TokenText =
180 StringRef(Dollar->TokenText.begin(),
181 String->TokenText.end() - Dollar->TokenText.begin());
182 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
183 Dollar->Type = TT_CSharpStringLiteral;
184 Tokens.erase(Tokens.end() - 2);
185 Tokens.erase(Tokens.end() - 1);
186 return true;
187 }
188 }
189
190 // Convert back into just a string_literal.
191 At->Tok.setKind(tok::string_literal);
192 At->TokenText = StringRef(At->TokenText.begin(),
193 String->TokenText.end() - At->TokenText.begin());
194 At->ColumnWidth += String->ColumnWidth;
195 At->Type = TT_CSharpStringLiteral;
196 Tokens.erase(Tokens.end() - 1);
197 return true;
198}
199
200bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
201 if (Tokens.size() < 2)
202 return false;
203 auto &FirstQuestion = *(Tokens.end() - 2);
204 auto &SecondQuestion = *(Tokens.end() - 1);
205 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
206 return false;
207 FirstQuestion->Tok.setKind(tok::question);
208 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
209 SecondQuestion->TokenText.end() -
210 FirstQuestion->TokenText.begin());
211 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
212 FirstQuestion->Type = TT_CSharpNullCoalescing;
213 Tokens.erase(Tokens.end() - 1);
214 return true;
215}
216
217bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
218 if (Tokens.size() < 2)
219 return false;
220 auto &At = *(Tokens.end() - 2);
221 auto &Keyword = *(Tokens.end() - 1);
222 if (!At->is(tok::at))
223 return false;
224 if (!Keywords.isCSharpKeyword(*Keyword))
225 return false;
226
227 At->Tok.setKind(tok::identifier);
228 At->TokenText = StringRef(At->TokenText.begin(),
229 Keyword->TokenText.end() - At->TokenText.begin());
230 At->ColumnWidth += Keyword->ColumnWidth;
231 At->Type = Keyword->Type;
232 Tokens.erase(Tokens.end() - 1);
233 return true;
234}
235
236// In C# merge the Identifier and the ? together e.g. arg?.
237bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
238 if (Tokens.size() < 2)
239 return false;
240 auto &Identifier = *(Tokens.end() - 2);
241 auto &Question = *(Tokens.end() - 1);
242 if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
243 !Question->is(tok::question))
244 return false;
245 Identifier->TokenText =
246 StringRef(Identifier->TokenText.begin(),
247 Question->TokenText.end() - Identifier->TokenText.begin());
248 Identifier->ColumnWidth += Question->ColumnWidth;
Paul Hoadcbb726d2019-03-21 13:09:22 +0000249 Tokens.erase(Tokens.end() - 1);
250 return true;
251}
252
Martin Probstc4a0dd42016-05-20 11:24:24 +0000253bool FormatTokenLexer::tryMergeLessLess() {
254 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
255 if (Tokens.size() < 3)
256 return false;
257
258 bool FourthTokenIsLess = false;
259 if (Tokens.size() > 3)
260 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
261
262 auto First = Tokens.end() - 3;
263 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
264 First[0]->isNot(tok::less) || FourthTokenIsLess)
265 return false;
266
267 // Only merge if there currently is no whitespace between the two "<".
268 if (First[1]->WhitespaceRange.getBegin() !=
269 First[1]->WhitespaceRange.getEnd())
270 return false;
271
272 First[0]->Tok.setKind(tok::lessless);
273 First[0]->TokenText = "<<";
274 First[0]->ColumnWidth += 1;
275 Tokens.erase(Tokens.end() - 2);
276 return true;
277}
278
279bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
280 TokenType NewType) {
281 if (Tokens.size() < Kinds.size())
282 return false;
283
284 SmallVectorImpl<FormatToken *>::const_iterator First =
285 Tokens.end() - Kinds.size();
286 if (!First[0]->is(Kinds[0]))
287 return false;
288 unsigned AddLength = 0;
289 for (unsigned i = 1; i < Kinds.size(); ++i) {
Manuel Klimek89628f62017-09-20 09:51:03 +0000290 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
291 First[i]->WhitespaceRange.getEnd())
Martin Probstc4a0dd42016-05-20 11:24:24 +0000292 return false;
293 AddLength += First[i]->TokenText.size();
294 }
295 Tokens.resize(Tokens.size() - Kinds.size() + 1);
296 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
297 First[0]->TokenText.size() + AddLength);
298 First[0]->ColumnWidth += AddLength;
299 First[0]->Type = NewType;
300 return true;
301}
302
303// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
304bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
305 // NB: This is not entirely correct, as an r_paren can introduce an operand
306 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
307 // corner case to not matter in practice, though.
308 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
309 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
310 tok::colon, tok::question, tok::tilde) ||
311 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
312 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
313 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
314 Tok->isBinaryOperator();
315}
316
317bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
318 if (!Prev)
319 return true;
320
321 // Regex literals can only follow after prefix unary operators, not after
322 // postfix unary operators. If the '++' is followed by a non-operand
323 // introducing token, the slash here is the operand and not the start of a
324 // regex.
Martin Probst16282992017-02-07 14:08:03 +0000325 // `!` is an unary prefix operator, but also a post-fix operator that casts
326 // away nullability, so the same check applies.
327 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
Martin Probstc4a0dd42016-05-20 11:24:24 +0000328 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
329
330 // The previous token must introduce an operand location where regex
331 // literals can occur.
332 if (!precedesOperand(Prev))
333 return false;
334
335 return true;
336}
337
338// Tries to parse a JavaScript Regex literal starting at the current token,
339// if that begins with a slash and is in a location where JavaScript allows
340// regex literals. Changes the current token to a regex literal and updates
341// its text if successful.
342void FormatTokenLexer::tryParseJSRegexLiteral() {
343 FormatToken *RegexToken = Tokens.back();
344 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
345 return;
346
347 FormatToken *Prev = nullptr;
348 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
349 // NB: Because previous pointers are not initialized yet, this cannot use
350 // Token.getPreviousNonComment.
351 if ((*I)->isNot(tok::comment)) {
352 Prev = *I;
353 break;
354 }
355 }
356
357 if (!canPrecedeRegexLiteral(Prev))
358 return;
359
360 // 'Manually' lex ahead in the current file buffer.
361 const char *Offset = Lex->getBufferLocation();
362 const char *RegexBegin = Offset - RegexToken->TokenText.size();
363 StringRef Buffer = Lex->getBuffer();
364 bool InCharacterClass = false;
365 bool HaveClosingSlash = false;
366 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
367 // Regular expressions are terminated with a '/', which can only be
368 // escaped using '\' or a character class between '[' and ']'.
369 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
370 switch (*Offset) {
371 case '\\':
372 // Skip the escaped character.
373 ++Offset;
374 break;
375 case '[':
376 InCharacterClass = true;
377 break;
378 case ']':
379 InCharacterClass = false;
380 break;
381 case '/':
382 if (!InCharacterClass)
383 HaveClosingSlash = true;
384 break;
385 }
386 }
387
388 RegexToken->Type = TT_RegexLiteral;
389 // Treat regex literals like other string_literals.
390 RegexToken->Tok.setKind(tok::string_literal);
391 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
392 RegexToken->ColumnWidth = RegexToken->TokenText.size();
393
394 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
395}
396
Martin Probst6181da42016-08-25 10:13:21 +0000397void FormatTokenLexer::handleTemplateStrings() {
Martin Probstc4a0dd42016-05-20 11:24:24 +0000398 FormatToken *BacktickToken = Tokens.back();
Martin Probst6181da42016-08-25 10:13:21 +0000399
400 if (BacktickToken->is(tok::l_brace)) {
401 StateStack.push(LexerState::NORMAL);
Martin Probstc4a0dd42016-05-20 11:24:24 +0000402 return;
Martin Probst6181da42016-08-25 10:13:21 +0000403 }
404 if (BacktickToken->is(tok::r_brace)) {
Daniel Jasper58209dd2016-09-17 07:20:36 +0000405 if (StateStack.size() == 1)
406 return;
Martin Probst6181da42016-08-25 10:13:21 +0000407 StateStack.pop();
408 if (StateStack.top() != LexerState::TEMPLATE_STRING)
409 return;
410 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
411 } else if (BacktickToken->is(tok::unknown) &&
412 BacktickToken->TokenText == "`") {
413 StateStack.push(LexerState::TEMPLATE_STRING);
414 } else {
415 return; // Not actually a template
416 }
Martin Probstc4a0dd42016-05-20 11:24:24 +0000417
418 // 'Manually' lex ahead in the current file buffer.
419 const char *Offset = Lex->getBufferLocation();
420 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
Martin Probst6181da42016-08-25 10:13:21 +0000421 for (; Offset != Lex->getBuffer().end(); ++Offset) {
422 if (Offset[0] == '`') {
423 StateStack.pop();
424 break;
425 }
426 if (Offset[0] == '\\') {
Martin Probstc4a0dd42016-05-20 11:24:24 +0000427 ++Offset; // Skip the escaped character.
Martin Probst6181da42016-08-25 10:13:21 +0000428 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
429 Offset[1] == '{') {
430 // '${' introduces an expression interpolation in the template string.
431 StateStack.push(LexerState::NORMAL);
432 ++Offset;
433 break;
434 }
Martin Probstc4a0dd42016-05-20 11:24:24 +0000435 }
436
437 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
438 BacktickToken->Type = TT_TemplateString;
439 BacktickToken->Tok.setKind(tok::string_literal);
440 BacktickToken->TokenText = LiteralText;
441
442 // Adjust width for potentially multiline string literals.
443 size_t FirstBreak = LiteralText.find('\n');
444 StringRef FirstLineText = FirstBreak == StringRef::npos
445 ? LiteralText
446 : LiteralText.substr(0, FirstBreak);
447 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
448 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
449 size_t LastBreak = LiteralText.rfind('\n');
450 if (LastBreak != StringRef::npos) {
451 BacktickToken->IsMultiline = true;
452 unsigned StartColumn = 0; // The template tail spans the entire line.
453 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
454 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
455 Style.TabWidth, Encoding);
456 }
457
Martin Probst6181da42016-08-25 10:13:21 +0000458 SourceLocation loc = Offset < Lex->getBuffer().end()
459 ? Lex->getSourceLocation(Offset + 1)
460 : SourceMgr.getLocForEndOfFile(ID);
461 resetLexer(SourceMgr.getFileOffset(loc));
Martin Probstc4a0dd42016-05-20 11:24:24 +0000462}
463
Krasimir Georgiev410ed242017-11-10 12:50:09 +0000464void FormatTokenLexer::tryParsePythonComment() {
465 FormatToken *HashToken = Tokens.back();
Krasimir Georgiev45dde412018-06-07 09:46:24 +0000466 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
Krasimir Georgiev410ed242017-11-10 12:50:09 +0000467 return;
468 // Turn the remainder of this line into a comment.
469 const char *CommentBegin =
470 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
471 size_t From = CommentBegin - Lex->getBuffer().begin();
472 size_t To = Lex->getBuffer().find_first_of('\n', From);
473 if (To == StringRef::npos)
474 To = Lex->getBuffer().size();
475 size_t Len = To - From;
476 HashToken->Type = TT_LineComment;
477 HashToken->Tok.setKind(tok::comment);
478 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
479 SourceLocation Loc = To < Lex->getBuffer().size()
480 ? Lex->getSourceLocation(CommentBegin + Len)
481 : SourceMgr.getLocForEndOfFile(ID);
482 resetLexer(SourceMgr.getFileOffset(Loc));
483}
484
Martin Probstc4a0dd42016-05-20 11:24:24 +0000485bool FormatTokenLexer::tryMerge_TMacro() {
486 if (Tokens.size() < 4)
487 return false;
488 FormatToken *Last = Tokens.back();
489 if (!Last->is(tok::r_paren))
490 return false;
491
492 FormatToken *String = Tokens[Tokens.size() - 2];
493 if (!String->is(tok::string_literal) || String->IsMultiline)
494 return false;
495
496 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
497 return false;
498
499 FormatToken *Macro = Tokens[Tokens.size() - 4];
500 if (Macro->TokenText != "_T")
501 return false;
502
503 const char *Start = Macro->TokenText.data();
504 const char *End = Last->TokenText.data() + Last->TokenText.size();
505 String->TokenText = StringRef(Start, End - Start);
506 String->IsFirst = Macro->IsFirst;
507 String->LastNewlineOffset = Macro->LastNewlineOffset;
508 String->WhitespaceRange = Macro->WhitespaceRange;
509 String->OriginalColumn = Macro->OriginalColumn;
510 String->ColumnWidth = encoding::columnWidthWithTabs(
511 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
512 String->NewlinesBefore = Macro->NewlinesBefore;
513 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
514
515 Tokens.pop_back();
516 Tokens.pop_back();
517 Tokens.pop_back();
518 Tokens.back() = String;
519 return true;
520}
521
522bool FormatTokenLexer::tryMergeConflictMarkers() {
523 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
524 return false;
525
526 // Conflict lines look like:
527 // <marker> <text from the vcs>
528 // For example:
529 // >>>>>>> /file/in/file/system at revision 1234
530 //
531 // We merge all tokens in a line that starts with a conflict marker
532 // into a single token with a special token type that the unwrapped line
533 // parser will use to correctly rebuild the underlying code.
534
535 FileID ID;
536 // Get the position of the first token in the line.
537 unsigned FirstInLineOffset;
538 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
539 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
540 StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
541 // Calculate the offset of the start of the current line.
542 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
543 if (LineOffset == StringRef::npos) {
544 LineOffset = 0;
545 } else {
546 ++LineOffset;
547 }
548
549 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
550 StringRef LineStart;
551 if (FirstSpace == StringRef::npos) {
552 LineStart = Buffer.substr(LineOffset);
553 } else {
554 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
555 }
556
557 TokenType Type = TT_Unknown;
558 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
559 Type = TT_ConflictStart;
560 } else if (LineStart == "|||||||" || LineStart == "=======" ||
561 LineStart == "====") {
562 Type = TT_ConflictAlternative;
563 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
564 Type = TT_ConflictEnd;
565 }
566
567 if (Type != TT_Unknown) {
568 FormatToken *Next = Tokens.back();
569
570 Tokens.resize(FirstInLineIndex + 1);
571 // We do not need to build a complete token here, as we will skip it
572 // during parsing anyway (as we must not touch whitespace around conflict
573 // markers).
574 Tokens.back()->Type = Type;
575 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
576
577 Tokens.push_back(Next);
578 return true;
579 }
580
581 return false;
582}
583
584FormatToken *FormatTokenLexer::getStashedToken() {
585 // Create a synthesized second '>' or '<' token.
586 Token Tok = FormatTok->Tok;
587 StringRef TokenText = FormatTok->TokenText;
588
589 unsigned OriginalColumn = FormatTok->OriginalColumn;
590 FormatTok = new (Allocator.Allocate()) FormatToken;
591 FormatTok->Tok = Tok;
592 SourceLocation TokLocation =
593 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
594 FormatTok->Tok.setLocation(TokLocation);
595 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
596 FormatTok->TokenText = TokenText;
597 FormatTok->ColumnWidth = 1;
598 FormatTok->OriginalColumn = OriginalColumn + 1;
599
600 return FormatTok;
601}
602
603FormatToken *FormatTokenLexer::getNextToken() {
Martin Probst6181da42016-08-25 10:13:21 +0000604 if (StateStack.top() == LexerState::TOKEN_STASHED) {
605 StateStack.pop();
Martin Probstc4a0dd42016-05-20 11:24:24 +0000606 return getStashedToken();
607 }
608
609 FormatTok = new (Allocator.Allocate()) FormatToken;
610 readRawToken(*FormatTok);
611 SourceLocation WhitespaceStart =
612 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
613 FormatTok->IsFirst = IsFirstToken;
614 IsFirstToken = false;
615
616 // Consume and record whitespace until we find a significant token.
617 unsigned WhitespaceLength = TrailingWhitespace;
618 while (FormatTok->Tok.is(tok::unknown)) {
619 StringRef Text = FormatTok->TokenText;
620 auto EscapesNewline = [&](int pos) {
621 // A '\r' here is just part of '\r\n'. Skip it.
622 if (pos >= 0 && Text[pos] == '\r')
623 --pos;
624 // See whether there is an odd number of '\' before this.
Richard Smith1d2ae942017-04-17 23:44:51 +0000625 // FIXME: This is wrong. A '\' followed by a newline is always removed,
626 // regardless of whether there is another '\' before it.
627 // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
Martin Probstc4a0dd42016-05-20 11:24:24 +0000628 unsigned count = 0;
629 for (; pos >= 0; --pos, ++count)
630 if (Text[pos] != '\\')
631 break;
632 return count & 1;
633 };
634 // FIXME: This miscounts tok:unknown tokens that are not just
635 // whitespace, e.g. a '`' character.
636 for (int i = 0, e = Text.size(); i != e; ++i) {
637 switch (Text[i]) {
638 case '\n':
639 ++FormatTok->NewlinesBefore;
640 FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
641 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
642 Column = 0;
643 break;
644 case '\r':
645 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
646 Column = 0;
647 break;
648 case '\f':
649 case '\v':
650 Column = 0;
651 break;
652 case ' ':
653 ++Column;
654 break;
655 case '\t':
656 Column += Style.TabWidth - Column % Style.TabWidth;
657 break;
658 case '\\':
659 if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
660 FormatTok->Type = TT_ImplicitStringLiteral;
661 break;
662 default:
663 FormatTok->Type = TT_ImplicitStringLiteral;
664 break;
665 }
666 if (FormatTok->Type == TT_ImplicitStringLiteral)
667 break;
668 }
669
670 if (FormatTok->is(TT_ImplicitStringLiteral))
671 break;
672 WhitespaceLength += FormatTok->Tok.getLength();
673
674 readRawToken(*FormatTok);
675 }
676
Martin Probst64d31ed2017-08-08 14:52:42 +0000677 // JavaScript and Java do not allow to escape the end of the line with a
678 // backslash. Backslashes are syntax errors in plain source, but can occur in
679 // comments. When a single line comment ends with a \, it'll cause the next
680 // line of code to be lexed as a comment, breaking formatting. The code below
681 // finds comments that contain a backslash followed by a line break, truncates
682 // the comment token at the backslash, and resets the lexer to restart behind
683 // the backslash.
684 if ((Style.Language == FormatStyle::LK_JavaScript ||
685 Style.Language == FormatStyle::LK_Java) &&
686 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
687 size_t BackslashPos = FormatTok->TokenText.find('\\');
688 while (BackslashPos != StringRef::npos) {
689 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
690 FormatTok->TokenText[BackslashPos + 1] == '\n') {
691 const char *Offset = Lex->getBufferLocation();
692 Offset -= FormatTok->TokenText.size();
693 Offset += BackslashPos + 1;
694 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
695 FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
696 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
697 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
698 Encoding);
699 break;
700 }
701 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
702 }
703 }
704
Martin Probstc4a0dd42016-05-20 11:24:24 +0000705 // In case the token starts with escaped newlines, we want to
706 // take them into account as whitespace - this pattern is quite frequent
707 // in macro definitions.
708 // FIXME: Add a more explicit test.
Krasimir Georgiev7f64fa82017-10-30 14:41:34 +0000709 while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
710 unsigned SkippedWhitespace = 0;
711 if (FormatTok->TokenText.size() > 2 &&
712 (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
713 SkippedWhitespace = 3;
714 else if (FormatTok->TokenText[1] == '\n')
715 SkippedWhitespace = 2;
716 else
717 break;
718
Martin Probstc4a0dd42016-05-20 11:24:24 +0000719 ++FormatTok->NewlinesBefore;
Krasimir Georgiev7f64fa82017-10-30 14:41:34 +0000720 WhitespaceLength += SkippedWhitespace;
721 FormatTok->LastNewlineOffset = SkippedWhitespace;
Martin Probstc4a0dd42016-05-20 11:24:24 +0000722 Column = 0;
Krasimir Georgiev7f64fa82017-10-30 14:41:34 +0000723 FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
Martin Probstc4a0dd42016-05-20 11:24:24 +0000724 }
725
726 FormatTok->WhitespaceRange = SourceRange(
727 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
728
729 FormatTok->OriginalColumn = Column;
730
731 TrailingWhitespace = 0;
732 if (FormatTok->Tok.is(tok::comment)) {
733 // FIXME: Add the trimmed whitespace to Column.
734 StringRef UntrimmedText = FormatTok->TokenText;
735 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
736 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
737 } else if (FormatTok->Tok.is(tok::raw_identifier)) {
738 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
739 FormatTok->Tok.setIdentifierInfo(&Info);
740 FormatTok->Tok.setKind(Info.getTokenID());
741 if (Style.Language == FormatStyle::LK_Java &&
742 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
743 tok::kw_operator)) {
744 FormatTok->Tok.setKind(tok::identifier);
745 FormatTok->Tok.setIdentifierInfo(nullptr);
746 } else if (Style.Language == FormatStyle::LK_JavaScript &&
747 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
748 tok::kw_operator)) {
749 FormatTok->Tok.setKind(tok::identifier);
750 FormatTok->Tok.setIdentifierInfo(nullptr);
751 }
752 } else if (FormatTok->Tok.is(tok::greatergreater)) {
753 FormatTok->Tok.setKind(tok::greater);
754 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
Malcolm Parsons6af3f142016-11-03 16:57:30 +0000755 ++Column;
Martin Probst6181da42016-08-25 10:13:21 +0000756 StateStack.push(LexerState::TOKEN_STASHED);
Martin Probstc4a0dd42016-05-20 11:24:24 +0000757 } else if (FormatTok->Tok.is(tok::lessless)) {
758 FormatTok->Tok.setKind(tok::less);
759 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
Malcolm Parsons6af3f142016-11-03 16:57:30 +0000760 ++Column;
Martin Probst6181da42016-08-25 10:13:21 +0000761 StateStack.push(LexerState::TOKEN_STASHED);
Martin Probstc4a0dd42016-05-20 11:24:24 +0000762 }
763
764 // Now FormatTok is the next non-whitespace token.
765
766 StringRef Text = FormatTok->TokenText;
767 size_t FirstNewlinePos = Text.find('\n');
768 if (FirstNewlinePos == StringRef::npos) {
769 // FIXME: ColumnWidth actually depends on the start column, we need to
770 // take this into account when the token is moved.
771 FormatTok->ColumnWidth =
772 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
773 Column += FormatTok->ColumnWidth;
774 } else {
775 FormatTok->IsMultiline = true;
776 // FIXME: ColumnWidth actually depends on the start column, we need to
777 // take this into account when the token is moved.
778 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
779 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
780
781 // The last line of the token always starts in column 0.
782 // Thus, the length can be precomputed even in the presence of tabs.
783 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
784 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
785 Column = FormatTok->LastLineColumnWidth;
786 }
787
Daniel Jasper1dbc2102017-03-31 13:30:24 +0000788 if (Style.isCpp()) {
Francois Ferrand6f40e212018-10-02 16:37:51 +0000789 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
Martin Probstc4a0dd42016-05-20 11:24:24 +0000790 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
791 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
792 tok::pp_define) &&
Francois Ferrand6f40e212018-10-02 16:37:51 +0000793 it != Macros.end()) {
794 FormatTok->Type = it->second;
Martin Probstc4a0dd42016-05-20 11:24:24 +0000795 } else if (FormatTok->is(tok::identifier)) {
796 if (MacroBlockBeginRegex.match(Text)) {
797 FormatTok->Type = TT_MacroBlockBegin;
798 } else if (MacroBlockEndRegex.match(Text)) {
799 FormatTok->Type = TT_MacroBlockEnd;
800 }
801 }
802 }
803
804 return FormatTok;
805}
806
807void FormatTokenLexer::readRawToken(FormatToken &Tok) {
808 Lex->LexFromRawLexer(Tok.Tok);
809 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
810 Tok.Tok.getLength());
811 // For formatting, treat unterminated string literals like normal string
812 // literals.
813 if (Tok.is(tok::unknown)) {
814 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
815 Tok.Tok.setKind(tok::string_literal);
816 Tok.IsUnterminatedLiteral = true;
817 } else if (Style.Language == FormatStyle::LK_JavaScript &&
818 Tok.TokenText == "''") {
819 Tok.Tok.setKind(tok::string_literal);
820 }
821 }
822
Daniel Jasper9c95dfe2018-03-12 10:32:18 +0000823 if ((Style.Language == FormatStyle::LK_JavaScript ||
824 Style.Language == FormatStyle::LK_Proto ||
825 Style.Language == FormatStyle::LK_TextProto) &&
Martin Probstc4a0dd42016-05-20 11:24:24 +0000826 Tok.is(tok::char_constant)) {
827 Tok.Tok.setKind(tok::string_literal);
828 }
829
830 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
831 Tok.TokenText == "/* clang-format on */")) {
832 FormattingDisabled = false;
833 }
834
835 Tok.Finalized = FormattingDisabled;
836
837 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
838 Tok.TokenText == "/* clang-format off */")) {
839 FormattingDisabled = true;
840 }
841}
842
843void FormatTokenLexer::resetLexer(unsigned Offset) {
844 StringRef Buffer = SourceMgr.getBufferData(ID);
845 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
846 getFormattingLangOpts(Style), Buffer.begin(),
847 Buffer.begin() + Offset, Buffer.end()));
848 Lex->SetKeepWhitespaceMode(true);
849 TrailingWhitespace = 0;
850}
851
852} // namespace format
853} // namespace clang