Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 1 | //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file provides functions that simplify extraction of source code. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
Yitzhak Mandelbaum | fbdf835 | 2019-10-10 02:34:47 +0000 | [diff] [blame] | 12 | #include "clang/Tooling/Transformer/SourceCode.h" |
Yitzhak Mandelbaum | 38b4516 | 2020-02-26 08:14:19 -0500 | [diff] [blame] | 13 | #include "clang/AST/ASTContext.h" |
| 14 | #include "clang/AST/Attr.h" |
| 15 | #include "clang/AST/Comment.h" |
| 16 | #include "clang/AST/Decl.h" |
| 17 | #include "clang/AST/DeclCXX.h" |
| 18 | #include "clang/AST/DeclTemplate.h" |
| 19 | #include "clang/AST/Expr.h" |
Reid Kleckner | 86565c1 | 2020-02-27 11:01:58 -0800 | [diff] [blame] | 20 | #include "clang/Basic/SourceManager.h" |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 21 | #include "clang/Lex/Lexer.h" |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 22 | #include "llvm/Support/Errc.h" |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 23 | |
| 24 | using namespace clang; |
| 25 | |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 26 | using llvm::errc; |
| 27 | using llvm::StringError; |
| 28 | |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 29 | StringRef clang::tooling::getText(CharSourceRange Range, |
| 30 | const ASTContext &Context) { |
| 31 | return Lexer::getSourceText(Range, Context.getSourceManager(), |
| 32 | Context.getLangOpts()); |
| 33 | } |
| 34 | |
| 35 | CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range, |
| 36 | tok::TokenKind Next, |
| 37 | ASTContext &Context) { |
| 38 | Optional<Token> Tok = Lexer::findNextToken( |
| 39 | Range.getEnd(), Context.getSourceManager(), Context.getLangOpts()); |
| 40 | if (!Tok || !Tok->is(Next)) |
| 41 | return Range; |
| 42 | return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation()); |
| 43 | } |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 44 | |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 45 | llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range, |
| 46 | const SourceManager &SM) { |
| 47 | if (Range.isInvalid()) |
| 48 | return llvm::make_error<StringError>(errc::invalid_argument, |
| 49 | "Invalid range"); |
| 50 | |
| 51 | if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID()) |
| 52 | return llvm::make_error<StringError>( |
| 53 | errc::invalid_argument, "Range starts or ends in a macro expansion"); |
| 54 | |
| 55 | if (SM.isInSystemHeader(Range.getBegin()) || |
| 56 | SM.isInSystemHeader(Range.getEnd())) |
| 57 | return llvm::make_error<StringError>(errc::invalid_argument, |
| 58 | "Range is in system header"); |
| 59 | |
| 60 | std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin()); |
| 61 | std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd()); |
| 62 | if (BeginInfo.first != EndInfo.first) |
| 63 | return llvm::make_error<StringError>( |
| 64 | errc::invalid_argument, "Range begins and ends in different files"); |
| 65 | |
| 66 | if (BeginInfo.second > EndInfo.second) |
| 67 | return llvm::make_error<StringError>( |
| 68 | errc::invalid_argument, "Range's begin is past its end"); |
| 69 | |
| 70 | return llvm::Error::success(); |
| 71 | } |
| 72 | |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 73 | llvm::Optional<CharSourceRange> |
| 74 | clang::tooling::getRangeForEdit(const CharSourceRange &EditRange, |
| 75 | const SourceManager &SM, |
| 76 | const LangOptions &LangOpts) { |
| 77 | // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity" |
| 78 | // macros. For example, if we're looking to rewrite the int literal 3 to 6, |
| 79 | // and we have the following definition: |
| 80 | // #define DO_NOTHING(x) x |
| 81 | // then |
| 82 | // foo(DO_NOTHING(3)) |
| 83 | // will be rewritten to |
| 84 | // foo(6) |
| 85 | // rather than the arguably better |
| 86 | // foo(DO_NOTHING(6)) |
| 87 | // Decide whether the current behavior is desirable and modify if not. |
| 88 | CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts); |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 89 | bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM)); |
| 90 | if (IsInvalid) |
| 91 | return llvm::None; |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 92 | return Range; |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 93 | |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 94 | } |
Yitzhak Mandelbaum | 38b4516 | 2020-02-26 08:14:19 -0500 | [diff] [blame] | 95 | |
| 96 | static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { |
| 97 | return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]); |
| 98 | } |
| 99 | |
| 100 | static bool contains(const std::set<tok::TokenKind> &Terminators, |
| 101 | const Token &Tok) { |
| 102 | return Terminators.count(Tok.getKind()) > 0; |
| 103 | } |
| 104 | |
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    // Start the raw lexer at the beginning of the entity's last token, so the
    // first LexFromRawLexer() call below re-lexes exactly that token.
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until a terminator is found, extending `End` over
  // every token consumed. Tokens that cannot belong to this entity (braces,
  // commas, EOF) end the search without being included.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: past the terminator, absorb trailing comments and stop at (and
  // include) the first newline; any other unrelated token ends the range
  // before it.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
| 236 | |
| 237 | // Returns the expected terminator tokens for the given declaration. |
| 238 | // |
| 239 | // If we do not know the correct terminator token, returns an empty set. |
| 240 | // |
| 241 | // There are cases where we have more than one possible terminator (for example, |
| 242 | // we find either a comma or a semicolon after a VarDecl). |
| 243 | static std::set<tok::TokenKind> getTerminators(const Decl &D) { |
| 244 | if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D)) |
| 245 | return {tok::semi}; |
| 246 | |
| 247 | if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D)) |
| 248 | return {tok::r_brace, tok::semi}; |
| 249 | |
| 250 | if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D)) |
| 251 | return {tok::comma, tok::semi}; |
| 252 | |
| 253 | return {}; |
| 254 | } |
| 255 | |
| 256 | // Starting from `Loc`, skips whitespace up to, and including, a single |
| 257 | // newline. Returns the (exclusive) end of any skipped whitespace (that is, the |
| 258 | // location immediately after the whitespace). |
| 259 | static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, |
| 260 | SourceLocation Loc, |
| 261 | const LangOptions &LangOpts) { |
| 262 | const char *LocChars = SM.getCharacterData(Loc); |
| 263 | int i = 0; |
| 264 | while (isHorizontalWhitespace(LocChars[i])) |
| 265 | ++i; |
| 266 | if (isVerticalWhitespace(LocChars[i])) |
| 267 | ++i; |
| 268 | return Loc.getLocWithOffset(i); |
| 269 | } |
| 270 | |
| 271 | // Is `Loc` separated from any following decl by something meaningful (e.g. an |
| 272 | // empty line, a comment), ignoring horizontal whitespace? Since this is a |
| 273 | // heuristic, we return false when in doubt. `Loc` cannot be the first location |
| 274 | // in the file. |
| 275 | static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, |
| 276 | const LangOptions &LangOpts) { |
| 277 | // If the preceding character is a newline, we'll check for an empty line as a |
| 278 | // separator. However, we can't identify an empty line using tokens, so we |
| 279 | // analyse the characters. If we try to use tokens, we'll just end up with a |
| 280 | // whitespace token, whose characters we'd have to analyse anyhow. |
| 281 | bool Invalid = false; |
| 282 | const char *LocChars = |
| 283 | SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid); |
| 284 | assert(!Invalid && |
| 285 | "Loc must be a valid character and not the first of the source file."); |
| 286 | if (isVerticalWhitespace(LocChars[0])) { |
| 287 | for (int i = 1; isWhitespace(LocChars[i]); ++i) |
| 288 | if (isVerticalWhitespace(LocChars[i])) |
| 289 | return true; |
| 290 | } |
| 291 | // We didn't find an empty line, so lex the next token, skipping past any |
| 292 | // whitespace we just scanned. |
| 293 | Token Tok; |
| 294 | bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts, |
| 295 | /*IgnoreWhiteSpace=*/true); |
| 296 | if (Failed) |
| 297 | // Any text that confuses the lexer seems fair to consider a separation. |
| 298 | return true; |
| 299 | |
| 300 | switch (Tok.getKind()) { |
| 301 | case tok::comment: |
| 302 | case tok::l_brace: |
| 303 | case tok::r_brace: |
| 304 | case tok::eof: |
| 305 | return true; |
| 306 | default: |
| 307 | return false; |
| 308 | } |
| 309 | } |
| 310 | |
// Returns the "logical" source range of `Decl`: its own source range, grown to
// include an enclosing template<> header, leading attributes, an associated
// preceding comment, and trailing terminators/comments/newline, mapped back to
// a file (non-macro) character range.
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  // The end is now a file character location, so the range is no longer a
  // token range.
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}