Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 1 | //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file provides functions that simplify extraction of source code. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
Yitzhak Mandelbaum | fbdf835 | 2019-10-10 02:34:47 +0000 | [diff] [blame] | 12 | #include "clang/Tooling/Transformer/SourceCode.h" |
Yitzhak Mandelbaum | 38b4516 | 2020-02-26 08:14:19 -0500 | [diff] [blame] | 13 | #include "clang/AST/ASTContext.h" |
| 14 | #include "clang/AST/Attr.h" |
| 15 | #include "clang/AST/Comment.h" |
| 16 | #include "clang/AST/Decl.h" |
| 17 | #include "clang/AST/DeclCXX.h" |
| 18 | #include "clang/AST/DeclTemplate.h" |
| 19 | #include "clang/AST/Expr.h" |
Reid Kleckner | 86565c1 | 2020-02-27 11:01:58 -0800 | [diff] [blame] | 20 | #include "clang/Basic/SourceManager.h" |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 21 | #include "clang/Lex/Lexer.h" |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 22 | #include "llvm/Support/Errc.h" |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 23 | |
| 24 | using namespace clang; |
| 25 | |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 26 | using llvm::errc; |
| 27 | using llvm::StringError; |
| 28 | |
Yitzhak Mandelbaum | 84f2271 | 2019-04-05 14:05:03 +0000 | [diff] [blame] | 29 | StringRef clang::tooling::getText(CharSourceRange Range, |
| 30 | const ASTContext &Context) { |
| 31 | return Lexer::getSourceText(Range, Context.getSourceManager(), |
| 32 | Context.getLangOpts()); |
| 33 | } |
| 34 | |
| 35 | CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range, |
| 36 | tok::TokenKind Next, |
| 37 | ASTContext &Context) { |
| 38 | Optional<Token> Tok = Lexer::findNextToken( |
| 39 | Range.getEnd(), Context.getSourceManager(), Context.getLangOpts()); |
| 40 | if (!Tok || !Tok->is(Next)) |
| 41 | return Range; |
| 42 | return CharSourceRange::getTokenRange(Range.getBegin(), Tok->getLocation()); |
| 43 | } |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 44 | |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 45 | llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range, |
| 46 | const SourceManager &SM) { |
| 47 | if (Range.isInvalid()) |
| 48 | return llvm::make_error<StringError>(errc::invalid_argument, |
| 49 | "Invalid range"); |
| 50 | |
| 51 | if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID()) |
| 52 | return llvm::make_error<StringError>( |
| 53 | errc::invalid_argument, "Range starts or ends in a macro expansion"); |
| 54 | |
| 55 | if (SM.isInSystemHeader(Range.getBegin()) || |
| 56 | SM.isInSystemHeader(Range.getEnd())) |
| 57 | return llvm::make_error<StringError>(errc::invalid_argument, |
| 58 | "Range is in system header"); |
| 59 | |
| 60 | std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin()); |
| 61 | std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd()); |
| 62 | if (BeginInfo.first != EndInfo.first) |
| 63 | return llvm::make_error<StringError>( |
| 64 | errc::invalid_argument, "Range begins and ends in different files"); |
| 65 | |
| 66 | if (BeginInfo.second > EndInfo.second) |
| 67 | return llvm::make_error<StringError>( |
| 68 | errc::invalid_argument, "Range's begin is past its end"); |
| 69 | |
| 70 | return llvm::Error::success(); |
| 71 | } |
| 72 | |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 73 | llvm::Optional<CharSourceRange> |
| 74 | clang::tooling::getRangeForEdit(const CharSourceRange &EditRange, |
| 75 | const SourceManager &SM, |
| 76 | const LangOptions &LangOpts) { |
| 77 | // FIXME: makeFileCharRange() has the disadvantage of stripping off "identity" |
| 78 | // macros. For example, if we're looking to rewrite the int literal 3 to 6, |
| 79 | // and we have the following definition: |
| 80 | // #define DO_NOTHING(x) x |
| 81 | // then |
| 82 | // foo(DO_NOTHING(3)) |
| 83 | // will be rewritten to |
| 84 | // foo(6) |
| 85 | // rather than the arguably better |
| 86 | // foo(DO_NOTHING(6)) |
| 87 | // Decide whether the current behavior is desirable and modify if not. |
| 88 | CharSourceRange Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts); |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 89 | bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM)); |
| 90 | if (IsInvalid) |
| 91 | return llvm::None; |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 92 | return Range; |
Yitzhak Mandelbaum | b9d2bf3 | 2020-01-06 11:00:44 -0500 | [diff] [blame] | 93 | |
Yitzhak Mandelbaum | 2e97a1e | 2019-07-18 17:26:57 +0000 | [diff] [blame] | 94 | } |
Yitzhak Mandelbaum | 38b4516 | 2020-02-26 08:14:19 -0500 | [diff] [blame] | 95 | |
| 96 | static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { |
| 97 | return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]); |
| 98 | } |
| 99 | |
| 100 | static bool contains(const std::set<tok::TokenKind> &Terminators, |
| 101 | const Token &Tok) { |
| 102 | return Terminators.count(Tok.getKind()) > 0; |
| 103 | } |
| 104 | |
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    // Start the raw lexer at the beginning of the entity's last token, so the
    // first LexFromRawLexer() call below re-lexes exactly that token.
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until a terminator is found, extending `End` over
  // every token consumed. Tokens that cannot belong to this entity (braces,
  // commas, EOF) end the search without being included.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: past the terminator, absorb trailing comments and stop at (and
  // include) the first newline; any other unrelated token ends the range
  // before it.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
| 236 | |
| 237 | // Returns the expected terminator tokens for the given declaration. |
| 238 | // |
| 239 | // If we do not know the correct terminator token, returns an empty set. |
| 240 | // |
| 241 | // There are cases where we have more than one possible terminator (for example, |
| 242 | // we find either a comma or a semicolon after a VarDecl). |
| 243 | static std::set<tok::TokenKind> getTerminators(const Decl &D) { |
| 244 | if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D)) |
| 245 | return {tok::semi}; |
| 246 | |
| 247 | if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D)) |
| 248 | return {tok::r_brace, tok::semi}; |
| 249 | |
| 250 | if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D)) |
| 251 | return {tok::comma, tok::semi}; |
| 252 | |
| 253 | return {}; |
| 254 | } |
| 255 | |
| 256 | // Starting from `Loc`, skips whitespace up to, and including, a single |
| 257 | // newline. Returns the (exclusive) end of any skipped whitespace (that is, the |
| 258 | // location immediately after the whitespace). |
| 259 | static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, |
| 260 | SourceLocation Loc, |
| 261 | const LangOptions &LangOpts) { |
| 262 | const char *LocChars = SM.getCharacterData(Loc); |
| 263 | int i = 0; |
| 264 | while (isHorizontalWhitespace(LocChars[i])) |
| 265 | ++i; |
| 266 | if (isVerticalWhitespace(LocChars[i])) |
| 267 | ++i; |
| 268 | return Loc.getLocWithOffset(i); |
| 269 | } |
| 270 | |
| 271 | // Is `Loc` separated from any following decl by something meaningful (e.g. an |
| 272 | // empty line, a comment), ignoring horizontal whitespace? Since this is a |
| 273 | // heuristic, we return false when in doubt. `Loc` cannot be the first location |
| 274 | // in the file. |
| 275 | static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, |
| 276 | const LangOptions &LangOpts) { |
| 277 | // If the preceding character is a newline, we'll check for an empty line as a |
| 278 | // separator. However, we can't identify an empty line using tokens, so we |
| 279 | // analyse the characters. If we try to use tokens, we'll just end up with a |
| 280 | // whitespace token, whose characters we'd have to analyse anyhow. |
| 281 | bool Invalid = false; |
| 282 | const char *LocChars = |
| 283 | SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid); |
| 284 | assert(!Invalid && |
| 285 | "Loc must be a valid character and not the first of the source file."); |
| 286 | if (isVerticalWhitespace(LocChars[0])) { |
| 287 | for (int i = 1; isWhitespace(LocChars[i]); ++i) |
| 288 | if (isVerticalWhitespace(LocChars[i])) |
| 289 | return true; |
| 290 | } |
| 291 | // We didn't find an empty line, so lex the next token, skipping past any |
| 292 | // whitespace we just scanned. |
| 293 | Token Tok; |
| 294 | bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts, |
| 295 | /*IgnoreWhiteSpace=*/true); |
| 296 | if (Failed) |
| 297 | // Any text that confuses the lexer seems fair to consider a separation. |
| 298 | return true; |
| 299 | |
| 300 | switch (Tok.getKind()) { |
| 301 | case tok::comment: |
| 302 | case tok::l_brace: |
| 303 | case tok::r_brace: |
| 304 | case tok::eof: |
| 305 | return true; |
| 306 | default: |
| 307 | return false; |
| 308 | } |
| 309 | } |
| 310 | |
// Returns the "logical" source range of `Decl`: its own source range, grown to
// include an enclosing template<> header, leading attributes, an associated
// preceding comment, and trailing terminators/comments/newline, mapped back to
// a file (non-macro) character range.
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a potential
  // newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  // The end is now a file character location, so the range is no longer a
  // token range.
  Range.setTokenRange(false);

  // Expand to include preceding associated comments. We ignore any comments
  // that are not preceding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}