Blame - clang/lib/Analysis/CloneDetection.cpp - toolchain/llvm-project

blob: 38db6ccb92363bc3b7f1a14d3ccad0706d47abb6 [file] [log] [blame]

Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	1	//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	///
				10	/// This file implements classes for searching and analyzing source code clones.
				11	///
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "clang/Analysis/CloneDetection.h"
				15
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	16	#include "clang/AST/DataCollection.h"
				17	#include "clang/AST/DeclTemplate.h"
Artem Dergachev	5657486	2016-08-20 17:35:53 +0000	[diff] [blame]	18	#include "llvm/Support/MD5.h"
Leslie Zhai	d91d19e	2017-06-19 01:55:50 +0000	[diff] [blame]	19	#include "llvm/Support/Path.h"
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	20
				21	using namespace clang;
				22
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	23	StmtSequence::StmtSequence(const CompoundStmt Stmt, const Decl D,
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	24	unsigned StartIndex, unsigned EndIndex)
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	25	: S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) {
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	26	assert(Stmt && "Stmt must not be a nullptr");
				27	assert(StartIndex < EndIndex && "Given array should not be empty");
				28	assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
				29	}
				30
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	31	StmtSequence::StmtSequence(const Stmt Stmt, const Decl D)
				32	: S(Stmt), D(D), StartIndex(0), EndIndex(0) {}
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	33
				34	StmtSequence::StmtSequence()
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	35	: S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {}
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	36
				37	bool StmtSequence::contains(const StmtSequence &Other) const {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	38	// If both sequences reside in different declarations, they can never contain
				39	// each other.
				40	if (D != Other.D)
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	41	return false;
				42
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	43	const SourceManager &SM = getASTContext().getSourceManager();
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	44
				45	// Otherwise check if the start and end locations of the current sequence
				46	// surround the other sequence.
				47	bool StartIsInBounds =
Stephen Kelly	a6e4358	2018-08-09 21:05:56 +0000	[diff] [blame]	48	SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) \|\|
				49	getBeginLoc() == Other.getBeginLoc();
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	50	if (!StartIsInBounds)
				51	return false;
				52
				53	bool EndIsInBounds =
				54	SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) \|\|
				55	Other.getEndLoc() == getEndLoc();
				56	return EndIsInBounds;
				57	}
				58
				59	StmtSequence::iterator StmtSequence::begin() const {
				60	if (!holdsSequence()) {
				61	return &S;
				62	}
				63	auto CS = cast<CompoundStmt>(S);
				64	return CS->body_begin() + StartIndex;
				65	}
				66
				67	StmtSequence::iterator StmtSequence::end() const {
				68	if (!holdsSequence()) {
Vassil Vassilev	5721e0f	2016-08-09 10:00:23 +0000	[diff] [blame]	69	return reinterpret_cast<StmtSequence::iterator>(&S) + 1;
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	70	}
				71	auto CS = cast<CompoundStmt>(S);
				72	return CS->body_begin() + EndIndex;
				73	}
				74
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	75	ASTContext &StmtSequence::getASTContext() const {
				76	assert(D);
				77	return D->getASTContext();
				78	}
				79
Stephen Kelly	94d33c0	2018-08-09 20:21:09 +0000	[diff] [blame]	80	SourceLocation StmtSequence::getBeginLoc() const {
Stephen Kelly	f2ceec4	2018-08-09 21:08:08 +0000	[diff] [blame^]	81	return front()->getBeginLoc();
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	82	}
				83
				84	SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
				85
Artem Dergachev	4eca0de	2016-10-08 10:54:30 +0000	[diff] [blame]	86	SourceRange StmtSequence::getSourceRange() const {
Stephen Kelly	a6e4358	2018-08-09 21:05:56 +0000	[diff] [blame]	87	return SourceRange(getBeginLoc(), getEndLoc());
Artem Dergachev	4eca0de	2016-10-08 10:54:30 +0000	[diff] [blame]	88	}
				89
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	90	void CloneDetector::analyzeCodeBody(const Decl *D) {
				91	assert(D);
				92	assert(D->hasBody());
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	93
				94	Sequences.push_back(StmtSequence(D->getBody(), D));
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	95	}
				96
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	97	/// Returns true if and only if \p Stmt contains at least one other
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	98	/// sequence in the \p Group.
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	99	static bool containsAnyInGroup(StmtSequence &Seq,
				100	CloneDetector::CloneGroup &Group) {
				101	for (StmtSequence &GroupSeq : Group) {
				102	if (Seq.contains(GroupSeq))
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	103	return true;
				104	}
				105	return false;
				106	}
				107
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	108	/// Returns true if and only if all sequences in \p OtherGroup are
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	109	/// contained by a sequence in \p Group.
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	110	static bool containsGroup(CloneDetector::CloneGroup &Group,
				111	CloneDetector::CloneGroup &OtherGroup) {
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	112	// We have less sequences in the current group than we have in the other,
				113	// so we will never fulfill the requirement for returning true. This is only
				114	// possible because we know that a sequence in Group can contain at most
				115	// one sequence in OtherGroup.
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	116	if (Group.size() < OtherGroup.size())
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	117	return false;
				118
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	119	for (StmtSequence &Stmt : Group) {
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	120	if (!containsAnyInGroup(Stmt, OtherGroup))
				121	return false;
				122	}
				123	return true;
				124	}
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	125
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	126	void OnlyLargestCloneConstraint::constrain(
				127	std::vector<CloneDetector::CloneGroup> &Result) {
Artem Dergachev	ba81632	2016-07-26 18:13:12 +0000	[diff] [blame]	128	std::vector<unsigned> IndexesToRemove;
				129
				130	// Compare every group in the result with the rest. If one groups contains
				131	// another group, we only need to return the bigger group.
				132	// Note: This doesn't scale well, so if possible avoid calling any heavy
				133	// function from this loop to minimize the performance impact.
				134	for (unsigned i = 0; i < Result.size(); ++i) {
				135	for (unsigned j = 0; j < Result.size(); ++j) {
				136	// Don't compare a group with itself.
				137	if (i == j)
				138	continue;
				139
				140	if (containsGroup(Result[j], Result[i])) {
				141	IndexesToRemove.push_back(i);
				142	break;
				143	}
				144	}
				145	}
				146
				147	// Erasing a list of indexes from the vector should be done with decreasing
				148	// indexes. As IndexesToRemove is constructed with increasing values, we just
				149	// reverse iterate over it to get the desired order.
				150	for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
				151	Result.erase(Result.begin() + *I);
				152	}
				153	}
Artem Dergachev	2fc1985	2016-08-18 12:29:41 +0000	[diff] [blame]	154
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	155	bool FilenamePatternConstraint::isAutoGenerated(
				156	const CloneDetector::CloneGroup &Group) {
Leslie Zhai	d91d19e	2017-06-19 01:55:50 +0000	[diff] [blame]	157	std::string Error;
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	158	if (IgnoredFilesPattern.empty() \|\| Group.empty() \|\|
Leslie Zhai	d91d19e	2017-06-19 01:55:50 +0000	[diff] [blame]	159	!IgnoredFilesRegex->isValid(Error))
				160	return false;
				161
				162	for (const StmtSequence &S : Group) {
				163	const SourceManager &SM = S.getASTContext().getSourceManager();
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	164	StringRef Filename = llvm::sys::path::filename(
				165	SM.getFilename(S.getContainingDecl()->getLocation()));
Leslie Zhai	d91d19e	2017-06-19 01:55:50 +0000	[diff] [blame]	166	if (IgnoredFilesRegex->match(Filename))
				167	return true;
				168	}
				169
				170	return false;
				171	}
				172
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	173	/// This class defines what a type II code clone is: If it collects for two
				174	/// statements the same data, then those two statements are considered to be
				175	/// clones of each other.
				176	///
				177	/// All collected data is forwarded to the given data consumer of the type T.
				178	/// The data consumer class needs to provide a member method with the signature:
				179	/// update(StringRef Str)
				180	namespace {
				181	template <class T>
				182	class CloneTypeIIStmtDataCollector
				183	: public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {
				184	ASTContext &Context;
				185	/// The data sink to which all data is forwarded.
				186	T &DataConsumer;
				187
				188	template <class Ty> void addData(const Ty &Data) {
				189	data_collection::addDataToConsumer(DataConsumer, Data);
				190	}
				191
				192	public:
				193	CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,
				194	T &DataConsumer)
				195	: Context(Context), DataConsumer(DataConsumer) {
				196	this->Visit(S);
				197	}
				198
				199	// Define a visit method for each class to collect data and subsequently visit
				200	// all parent classes. This uses a template so that custom visit methods by us
				201	// take precedence.
				202	#define DEF_ADD_DATA(CLASS, CODE) \
				203	template <class = void> void Visit##CLASS(const CLASS *S) { \
				204	CODE; \
				205	ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \
				206	}
				207
Johannes Altmanninger	1509da0	2017-09-06 13:20:51 +0000	[diff] [blame]	208	#include "clang/AST/StmtDataCollectors.inc"
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	209
				210	// Type II clones ignore variable names and literals, so let's skip them.
				211	#define SKIP(CLASS) \
				212	void Visit##CLASS(const CLASS *S) { \
				213	ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \
				214	}
				215	SKIP(DeclRefExpr)
				216	SKIP(MemberExpr)
				217	SKIP(IntegerLiteral)
				218	SKIP(FloatingLiteral)
				219	SKIP(StringLiteral)
				220	SKIP(CXXBoolLiteralExpr)
				221	SKIP(CharacterLiteral)
				222	#undef SKIP
				223	};
				224	} // end anonymous namespace
				225
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	226	static size_t createHash(llvm::MD5 &Hash) {
				227	size_t HashCode;
Artem Dergachev	2fc1985	2016-08-18 12:29:41 +0000	[diff] [blame]	228
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	229	// Create the final hash code for the current Stmt.
				230	llvm::MD5::MD5Result HashResult;
				231	Hash.final(HashResult);
Artem Dergachev	2fc1985	2016-08-18 12:29:41 +0000	[diff] [blame]	232
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	233	// Copy as much as possible of the generated hash code to the Stmt's hash
				234	// code.
				235	std::memcpy(&HashCode, &HashResult,
				236	std::min(sizeof(HashCode), sizeof(HashResult)));
Artem Dergachev	2fc1985	2016-08-18 12:29:41 +0000	[diff] [blame]	237
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	238	return HashCode;
				239	}
				240
Raphael Isemann	70686a1	2017-08-31 07:10:46 +0000	[diff] [blame]	241	/// Generates and saves a hash code for the given Stmt.
				242	/// \param S The given Stmt.
				243	/// \param D The Decl containing S.
				244	/// \param StmtsByHash Output parameter that will contain the hash codes for
				245	/// each StmtSequence in the given Stmt.
				246	/// \return The hash code of the given Stmt.
				247	///
				248	/// If the given Stmt is a CompoundStmt, this method will also generate
				249	/// hashes for all possible StmtSequences in the children of this Stmt.
				250	static size_t
				251	saveHash(const Stmt S, const Decl D,
				252	std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	253	llvm::MD5 Hash;
				254	ASTContext &Context = D->getASTContext();
				255
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	256	CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	257
				258	auto CS = dyn_cast<CompoundStmt>(S);
				259	SmallVector<size_t, 8> ChildHashes;
				260
				261	for (const Stmt *Child : S->children()) {
				262	if (Child == nullptr) {
				263	ChildHashes.push_back(0);
				264	continue;
				265	}
				266	size_t ChildHash = saveHash(Child, D, StmtsByHash);
				267	Hash.update(
				268	StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));
				269	ChildHashes.push_back(ChildHash);
				270	}
				271
				272	if (CS) {
Raphael Isemann	4eac9f0	2017-07-09 21:14:36 +0000	[diff] [blame]	273	// If we're in a CompoundStmt, we hash all possible combinations of child
				274	// statements to find clones in those subsequences.
				275	// We first go through every possible starting position of a subsequence.
				276	for (unsigned Pos = 0; Pos < CS->size(); ++Pos) {
				277	// Then we try all possible lengths this subsequence could have and
				278	// reuse the same hash object to make sure we only hash every child
				279	// hash exactly once.
				280	llvm::MD5 Hash;
				281	for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) {
				282	// Grab the current child hash and put it into our hash. We do
				283	// -1 on the index because we start counting the length at 1.
				284	size_t ChildHash = ChildHashes[Pos + Length - 1];
				285	Hash.update(
				286	StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));
				287	// If we have at least two elements in our subsequence, we can start
				288	// saving it.
				289	if (Length > 1) {
				290	llvm::MD5 SubHash = Hash;
				291	StmtsByHash.push_back(std::make_pair(
				292	createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length)));
Artem Dergachev	f8b4fc3	2017-04-05 14:17:36 +0000	[diff] [blame]	293	}
Artem Dergachev	f8b4fc3	2017-04-05 14:17:36 +0000	[diff] [blame]	294	}
Artem Dergachev	f8b4fc3	2017-04-05 14:17:36 +0000	[diff] [blame]	295	}
				296	}
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	297
				298	size_t HashCode = createHash(Hash);
				299	StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D)));
				300	return HashCode;
				301	}
				302
				303	namespace {
				304	/// Wrapper around FoldingSetNodeID that it can be used as the template
				305	/// argument of the StmtDataCollector.
				306	class FoldingSetNodeIDWrapper {
				307
				308	llvm::FoldingSetNodeID &FS;
				309
				310	public:
				311	FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}
				312
				313	void update(StringRef Str) { FS.AddString(Str); }
				314	};
				315	} // end anonymous namespace
				316
				317	/// Writes the relevant data from all statements and child statements
				318	/// in the given StmtSequence into the given FoldingSetNodeID.
				319	static void CollectStmtSequenceData(const StmtSequence &Sequence,
				320	FoldingSetNodeIDWrapper &OutputData) {
				321	for (const Stmt *S : Sequence) {
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	322	CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(
				323	S, Sequence.getASTContext(), OutputData);
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	324
				325	for (const Stmt *Child : S->children()) {
				326	if (!Child)
				327	continue;
				328
				329	CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()),
				330	OutputData);
				331	}
				332	}
				333	}
				334
				335	/// Returns true if both sequences are clones of each other.
				336	static bool areSequencesClones(const StmtSequence &LHS,
				337	const StmtSequence &RHS) {
				338	// We collect the data from all statements in the sequence as we did before
				339	// when generating a hash value for each sequence. But this time we don't
				340	// hash the collected data and compare the whole data set instead. This
				341	// prevents any false-positives due to hash code collisions.
				342	llvm::FoldingSetNodeID DataLHS, DataRHS;
				343	FoldingSetNodeIDWrapper LHSWrapper(DataLHS);
				344	FoldingSetNodeIDWrapper RHSWrapper(DataRHS);
				345
				346	CollectStmtSequenceData(LHS, LHSWrapper);
				347	CollectStmtSequenceData(RHS, RHSWrapper);
				348
				349	return DataLHS == DataRHS;
				350	}
				351
Raphael Isemann	70686a1	2017-08-31 07:10:46 +0000	[diff] [blame]	352	void RecursiveCloneTypeIIHashConstraint::constrain(
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	353	std::vector<CloneDetector::CloneGroup> &Sequences) {
				354	// FIXME: Maybe we can do this in-place and don't need this additional vector.
				355	std::vector<CloneDetector::CloneGroup> Result;
				356
				357	for (CloneDetector::CloneGroup &Group : Sequences) {
				358	// We assume in the following code that the Group is non-empty, so we
				359	// skip all empty groups.
				360	if (Group.empty())
				361	continue;
				362
				363	std::vector<std::pair<size_t, StmtSequence>> StmtsByHash;
				364
				365	// Generate hash codes for all children of S and save them in StmtsByHash.
				366	for (const StmtSequence &S : Group) {
				367	saveHash(S.front(), S.getContainingDecl(), StmtsByHash);
				368	}
				369
				370	// Sort hash_codes in StmtsByHash.
				371	std::stable_sort(StmtsByHash.begin(), StmtsByHash.end(),
Ivan Krasin	1e1acbc	2017-04-06 17:42:05 +0000	[diff] [blame]	372	[](std::pair<size_t, StmtSequence> LHS,
Johannes Altmanninger	1a26769	2017-08-23 16:28:26 +0000	[diff] [blame]	373	std::pair<size_t, StmtSequence> RHS) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	374	return LHS.first < RHS.first;
				375	});
				376
				377	// Check for each StmtSequence if its successor has the same hash value.
				378	// We don't check the last StmtSequence as it has no successor.
				379	// Note: The 'size - 1 ' in the condition is safe because we check for an
				380	// empty Group vector at the beginning of this function.
				381	for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) {
				382	const auto Current = StmtsByHash[i];
				383
Alexander Kornienko	2a8c18d	2018-04-06 15:14:32 +0000	[diff] [blame]	384	// It's likely that we just found a sequence of StmtSequences that
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	385	// represent a CloneGroup, so we create a new group and start checking and
				386	// adding the StmtSequences in this sequence.
				387	CloneDetector::CloneGroup NewGroup;
				388
				389	size_t PrototypeHash = Current.first;
				390
				391	for (; i < StmtsByHash.size(); ++i) {
				392	// A different hash value means we have reached the end of the sequence.
Raphael Isemann	70686a1	2017-08-31 07:10:46 +0000	[diff] [blame]	393	if (PrototypeHash != StmtsByHash[i].first) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	394	// The current sequence could be the start of a new CloneGroup. So we
				395	// decrement i so that we visit it again in the outer loop.
				396	// Note: i can never be 0 at this point because we are just comparing
				397	// the hash of the Current StmtSequence with itself in the 'if' above.
				398	assert(i != 0);
				399	--i;
				400	break;
				401	}
				402	// Same hash value means we should add the StmtSequence to the current
				403	// group.
				404	NewGroup.push_back(StmtsByHash[i].second);
				405	}
				406
				407	// We created a new clone group with matching hash codes and move it to
				408	// the result vector.
				409	Result.push_back(NewGroup);
				410	}
				411	}
				412	// Sequences is the output parameter, so we copy our result into it.
				413	Sequences = Result;
				414	}
				415
Raphael Isemann	70686a1	2017-08-31 07:10:46 +0000	[diff] [blame]	416	void RecursiveCloneTypeIIVerifyConstraint::constrain(
				417	std::vector<CloneDetector::CloneGroup> &Sequences) {
				418	CloneConstraint::splitCloneGroups(
				419	Sequences, [](const StmtSequence &A, const StmtSequence &B) {
				420	return areSequencesClones(A, B);
				421	});
				422	}
				423
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	424	size_t MinComplexityConstraint::calculateStmtComplexity(
Raphael Isemann	785e816	2017-09-03 13:45:33 +0000	[diff] [blame]	425	const StmtSequence &Seq, std::size_t Limit,
				426	const std::string &ParentMacroStack) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	427	if (Seq.empty())
				428	return 0;
				429
				430	size_t Complexity = 1;
				431
				432	ASTContext &Context = Seq.getASTContext();
				433
				434	// Look up what macros expanded into the current statement.
Raphael Isemann	785e816	2017-09-03 13:45:33 +0000	[diff] [blame]	435	std::string MacroStack =
Stephen Kelly	a6e4358	2018-08-09 21:05:56 +0000	[diff] [blame]	436	data_collection::getMacroStack(Seq.getBeginLoc(), Context);
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	437
				438	// First, check if ParentMacroStack is not empty which means we are currently
				439	// dealing with a parent statement which was expanded from a macro.
				440	// If this parent statement was expanded from the same macros as this
				441	// statement, we reduce the initial complexity of this statement to zero.
				442	// This causes that a group of statements that were generated by a single
				443	// macro expansion will only increase the total complexity by one.
				444	// Note: This is not the final complexity of this statement as we still
				445	// add the complexity of the child statements to the complexity value.
Raphael Isemann	785e816	2017-09-03 13:45:33 +0000	[diff] [blame]	446	if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	447	Complexity = 0;
				448	}
				449
				450	// Iterate over the Stmts in the StmtSequence and add their complexity values
				451	// to the current complexity value.
				452	if (Seq.holdsSequence()) {
				453	for (const Stmt *S : Seq) {
				454	Complexity += calculateStmtComplexity(
Raphael Isemann	785e816	2017-09-03 13:45:33 +0000	[diff] [blame]	455	StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);
				456	if (Complexity >= Limit)
				457	return Limit;
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	458	}
				459	} else {
				460	for (const Stmt *S : Seq.front()->children()) {
				461	Complexity += calculateStmtComplexity(
Raphael Isemann	785e816	2017-09-03 13:45:33 +0000	[diff] [blame]	462	StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);
				463	if (Complexity >= Limit)
				464	return Limit;
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	465	}
				466	}
				467	return Complexity;
				468	}
				469
				470	void MatchingVariablePatternConstraint::constrain(
				471	std::vector<CloneDetector::CloneGroup> &CloneGroups) {
				472	CloneConstraint::splitCloneGroups(
				473	CloneGroups, [](const StmtSequence &A, const StmtSequence &B) {
				474	VariablePattern PatternA(A);
				475	VariablePattern PatternB(B);
				476	return PatternA.countPatternDifferences(PatternB) == 0;
				477	});
				478	}
				479
				480	void CloneConstraint::splitCloneGroups(
				481	std::vector<CloneDetector::CloneGroup> &CloneGroups,
Benjamin Kramer	0b94bfc	2017-09-01 16:51:51 +0000	[diff] [blame]	482	llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
				483	Compare) {
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	484	std::vector<CloneDetector::CloneGroup> Result;
				485	for (auto &HashGroup : CloneGroups) {
				486	// Contains all indexes in HashGroup that were already added to a
				487	// CloneGroup.
				488	std::vector<char> Indexes;
				489	Indexes.resize(HashGroup.size());
				490
				491	for (unsigned i = 0; i < HashGroup.size(); ++i) {
				492	// Skip indexes that are already part of a CloneGroup.
				493	if (Indexes[i])
				494	continue;
				495
				496	// Pick the first unhandled StmtSequence and consider it as the
				497	// beginning
				498	// of a new CloneGroup for now.
				499	// We don't add i to Indexes because we never iterate back.
				500	StmtSequence Prototype = HashGroup[i];
				501	CloneDetector::CloneGroup PotentialGroup = {Prototype};
				502	++Indexes[i];
				503
				504	// Check all following StmtSequences for clones.
				505	for (unsigned j = i + 1; j < HashGroup.size(); ++j) {
				506	// Skip indexes that are already part of a CloneGroup.
				507	if (Indexes[j])
				508	continue;
				509
Raphael Isemann	676b457	2017-06-21 05:41:39 +0000	[diff] [blame]	510	// If a following StmtSequence belongs to our CloneGroup, we add it.
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	511	const StmtSequence &Candidate = HashGroup[j];
				512
				513	if (!Compare(Prototype, Candidate))
				514	continue;
				515
				516	PotentialGroup.push_back(Candidate);
				517	// Make sure we never visit this StmtSequence again.
				518	++Indexes[j];
				519	}
				520
				521	// Otherwise, add it to the result and continue searching for more
				522	// groups.
				523	Result.push_back(PotentialGroup);
				524	}
				525
				526	assert(std::all_of(Indexes.begin(), Indexes.end(),
				527	[](char c) { return c == 1; }));
				528	}
				529	CloneGroups = Result;
				530	}
				531
				532	void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,
				533	const Stmt *Mention) {
				534	// First check if we already reference this variable
				535	for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
				536	if (Variables[KindIndex] == VarDecl) {
Malcolm Parsons	51d3fb0	2018-01-24 10:26:09 +0000	[diff] [blame]	537	// If yes, add a new occurrence that points to the existing entry in
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	538	// the Variables vector.
				539	Occurences.emplace_back(KindIndex, Mention);
				540	return;
				541	}
				542	}
				543	// If this variable wasn't already referenced, add it to the list of
Malcolm Parsons	51d3fb0	2018-01-24 10:26:09 +0000	[diff] [blame]	544	// referenced variables and add a occurrence that points to this new entry.
Artem Dergachev	da9e718	2017-04-06 14:34:07 +0000	[diff] [blame]	545	Occurences.emplace_back(Variables.size(), Mention);
				546	Variables.push_back(VarDecl);
				547	}
				548
				549	void VariablePattern::addVariables(const Stmt *S) {
				550	// Sometimes we get a nullptr (such as from IfStmts which often have nullptr
				551	// children). We skip such statements as they don't reference any
				552	// variables.
				553	if (!S)
				554	return;
				555
				556	// Check if S is a reference to a variable. If yes, add it to the pattern.
				557	if (auto D = dyn_cast<DeclRefExpr>(S)) {
				558	if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
				559	addVariableOccurence(VD, D);
				560	}
				561
				562	// Recursively check all children of the given statement.
				563	for (const Stmt *Child : S->children()) {
				564	addVariables(Child);
				565	}
				566	}
				567
				568	unsigned VariablePattern::countPatternDifferences(
				569	const VariablePattern &Other,
				570	VariablePattern::SuspiciousClonePair *FirstMismatch) {
				571	unsigned NumberOfDifferences = 0;
				572
				573	assert(Other.Occurences.size() == Occurences.size());
				574	for (unsigned i = 0; i < Occurences.size(); ++i) {
				575	auto ThisOccurence = Occurences[i];
				576	auto OtherOccurence = Other.Occurences[i];
				577	if (ThisOccurence.KindID == OtherOccurence.KindID)
				578	continue;
				579
				580	++NumberOfDifferences;
				581
				582	// If FirstMismatch is not a nullptr, we need to store information about
				583	// the first difference between the two patterns.
				584	if (FirstMismatch == nullptr)
				585	continue;
				586
				587	// Only proceed if we just found the first difference as we only store
				588	// information about the first difference.
				589	if (NumberOfDifferences != 1)
				590	continue;
				591
				592	const VarDecl *FirstSuggestion = nullptr;
				593	// If there is a variable available in the list of referenced variables
				594	// which wouldn't break the pattern if it is used in place of the
				595	// current variable, we provide this variable as the suggested fix.
				596	if (OtherOccurence.KindID < Variables.size())
				597	FirstSuggestion = Variables[OtherOccurence.KindID];
				598
				599	// Store information about the first clone.
				600	FirstMismatch->FirstCloneInfo =
				601	VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(
				602	Variables[ThisOccurence.KindID], ThisOccurence.Mention,
				603	FirstSuggestion);
				604
				605	// Same as above but with the other clone. We do this for both clones as
				606	// we don't know which clone is the one containing the unintended
				607	// pattern error.
				608	const VarDecl *SecondSuggestion = nullptr;
				609	if (ThisOccurence.KindID < Other.Variables.size())
				610	SecondSuggestion = Other.Variables[ThisOccurence.KindID];
				611
				612	// Store information about the second clone.
				613	FirstMismatch->SecondCloneInfo =
				614	VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(
				615	Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention,
				616	SecondSuggestion);
				617
				618	// SuspiciousClonePair guarantees that the first clone always has a
				619	// suggested variable associated with it. As we know that one of the two
				620	// clones in the pair always has suggestion, we swap the two clones
				621	// in case the first clone has no suggested variable which means that
				622	// the second clone has a suggested variable and should be first.
				623	if (!FirstMismatch->FirstCloneInfo.Suggestion)
				624	std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo);
				625
				626	// This ensures that we always have at least one suggestion in a pair.
				627	assert(FirstMismatch->FirstCloneInfo.Suggestion);
				628	}
				629
				630	return NumberOfDifferences;
Artem Dergachev	2fc1985	2016-08-18 12:29:41 +0000	[diff] [blame]	631	}