blob: 38b331b0a1c3b141db2188bb4dc93caf88de8af6 [file] [log] [blame]
Yitzhak Mandelbaum84f22712019-04-05 14:05:03 +00001//===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file provides functions that simplify extraction of source code.
10//
11//===----------------------------------------------------------------------===//
Yitzhak Mandelbaumfbdf8352019-10-10 02:34:47 +000012#include "clang/Tooling/Transformer/SourceCode.h"
Yitzhak Mandelbaum38b45162020-02-26 08:14:19 -050013#include "clang/AST/ASTContext.h"
14#include "clang/AST/Attr.h"
15#include "clang/AST/Comment.h"
16#include "clang/AST/Decl.h"
17#include "clang/AST/DeclCXX.h"
18#include "clang/AST/DeclTemplate.h"
19#include "clang/AST/Expr.h"
Reid Kleckner86565c12020-02-27 11:01:58 -080020#include "clang/Basic/SourceManager.h"
Yitzhak Mandelbaum84f22712019-04-05 14:05:03 +000021#include "clang/Lex/Lexer.h"
Yitzhak Mandelbaumb9d2bf32020-01-06 11:00:44 -050022#include "llvm/Support/Errc.h"
Yitzhak Mandelbaum84f22712019-04-05 14:05:03 +000023
24using namespace clang;
25
Yitzhak Mandelbaumb9d2bf32020-01-06 11:00:44 -050026using llvm::errc;
27using llvm::StringError;
28
Yitzhak Mandelbaum84f22712019-04-05 14:05:03 +000029StringRef clang::tooling::getText(CharSourceRange Range,
30 const ASTContext &Context) {
31 return Lexer::getSourceText(Range, Context.getSourceManager(),
32 Context.getLangOpts());
33}
34
35CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
36 tok::TokenKind Next,
37 ASTContext &Context) {
38 Optional<Token> Tok = Lexer::findNextToken(
39 Range.getEnd(), Context.getSourceManager(), Context.getLangOpts());
40 if (!Tok || !Tok->is(Next))
41 return Range;
42 return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation());
43}
Yitzhak Mandelbaum2e97a1e2019-07-18 17:26:57 +000044
Yitzhak Mandelbaumb9d2bf32020-01-06 11:00:44 -050045llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
46 const SourceManager &SM) {
47 if (Range.isInvalid())
48 return llvm::make_error<StringError>(errc::invalid_argument,
49 "Invalid range");
50
51 if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
52 return llvm::make_error<StringError>(
53 errc::invalid_argument, "Range starts or ends in a macro expansion");
54
55 if (SM.isInSystemHeader(Range.getBegin()) ||
56 SM.isInSystemHeader(Range.getEnd()))
57 return llvm::make_error<StringError>(errc::invalid_argument,
58 "Range is in system header");
59
60 std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
61 std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
62 if (BeginInfo.first != EndInfo.first)
63 return llvm::make_error<StringError>(
64 errc::invalid_argument, "Range begins and ends in different files");
65
66 if (BeginInfo.second > EndInfo.second)
67 return llvm::make_error<StringError>(
68 errc::invalid_argument, "Range's begin is past its end");
69
70 return llvm::Error::success();
71}
72
Yitzhak Mandelbaum2e97a1e2019-07-18 17:26:57 +000073llvm::Optional<CharSourceRange>
74clang::tooling::getRangeForEdit(const CharSourceRange &EditRange,
75 const SourceManager &SM,
76 const LangOptions &LangOpts) {
77 // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity"
78 // macros. For example, if we're looking to rewrite the int literal 3 to 6,
79 // and we have the following definition:
80 // #define DO_NOTHING(x) x
81 // then
82 // foo(DO_NOTHING(3))
83 // will be rewritten to
84 // foo(6)
85 // rather than the arguably better
86 // foo(DO_NOTHING(6))
87 // Decide whether the current behavior is desirable and modify if not.
88 CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
Yitzhak Mandelbaumb9d2bf32020-01-06 11:00:44 -050089 bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
90 if (IsInvalid)
91 return llvm::None;
Yitzhak Mandelbaum2e97a1e2019-07-18 17:26:57 +000092 return Range;
Yitzhak Mandelbaumb9d2bf32020-01-06 11:00:44 -050093
Yitzhak Mandelbaum2e97a1e2019-07-18 17:26:57 +000094}
Yitzhak Mandelbaum38b45162020-02-26 08:14:19 -050095
96static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
97 return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
98}
99
100static bool contains(const std::set<tok::TokenKind> &Terminators,
101 const Token &Tok) {
102 return Terminators.count(Tok.getKind()) > 0;
103}
104
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  // Build a raw lexer positioned at the end of the entity's (expansion-range)
  // last token, so we can scan the file text that follows the entity.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until a terminator is found. Unexpected separators
  // end the scan immediately without being included.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: after termination, absorb trailing comments; stop at the first
  // newline (included) or at the first unrelated token (excluded).
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
236
237// Returns the expected terminator tokens for the given declaration.
238//
239// If we do not know the correct terminator token, returns an empty set.
240//
241// There are cases where we have more than one possible terminator (for example,
242// we find either a comma or a semicolon after a VarDecl).
243static std::set<tok::TokenKind> getTerminators(const Decl &D) {
244 if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
245 return {tok::semi};
246
247 if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
248 return {tok::r_brace, tok::semi};
249
250 if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
251 return {tok::comma, tok::semi};
252
253 return {};
254}
255
256// Starting from `Loc`, skips whitespace up to, and including, a single
257// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
258// location immediately after the whitespace).
259static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
260 SourceLocation Loc,
261 const LangOptions &LangOpts) {
262 const char *LocChars = SM.getCharacterData(Loc);
263 int i = 0;
264 while (isHorizontalWhitespace(LocChars[i]))
265 ++i;
266 if (isVerticalWhitespace(LocChars[i]))
267 ++i;
268 return Loc.getLocWithOffset(i);
269}
270
// Is `Loc` separated from any following decl by something meaningful (e.g. an
// empty line, a comment), ignoring horizontal whitespace? Since this is a
// heuristic, we return false when in doubt. `Loc` cannot be the first location
// in the file.
static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
                                 const LangOptions &LangOpts) {
  // If the preceding character is a newline, we'll check for an empty line as a
  // separator. However, we can't identify an empty line using tokens, so we
  // analyse the characters. If we try to use tokens, we'll just end up with a
  // whitespace token, whose characters we'd have to analyse anyhow.
  bool Invalid = false;
  const char *LocChars =
      SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
  assert(!Invalid &&
         "Loc must be a valid character and not the first of the source file.");
  if (isVerticalWhitespace(LocChars[0])) {
    // `Loc` begins a line. Scan the run of whitespace that follows: a second
    // vertical-whitespace character within it means there is an empty line,
    // which we treat as a separation.
    for (int i = 1; isWhitespace(LocChars[i]); ++i)
      if (isVerticalWhitespace(LocChars[i]))
        return true;
  }
  // We didn't find an empty line, so lex the next token, skipping past any
  // whitespace we just scanned.
  Token Tok;
  bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
                                   /*IgnoreWhiteSpace=*/true);
  if (Failed)
    // Any text that confuses the lexer seems fair to consider a separation.
    return true;

  // Comments, braces, and end-of-file are treated as separators; any other
  // token suggests the comment could bind to following code.
  switch (Tok.getKind()) {
  case tok::comment:
  case tok::l_brace:
  case tok::r_brace:
  case tok::eof:
    return true;
  default:
    return false;
  }
}
310
// Returns the character range of source "associated" with `Decl`: the decl's
// own source range, expanded to cover a described template header, leading
// attributes, an attached leading comment (when it can only refer to this
// decl), and trailing terminator/comment/newline text (via getEntityEndLoc).
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  // The end location from getEntityEndLoc is a character (not token) location.
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    // Skip attributes that do not actually precede the current range begin.
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}
393}