blob: 0d3926ebd36de304e547d2d10af93212a5c2b74c [file] [log] [blame]
Artem Dergachevba816322016-07-26 18:13:12 +00001//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// This file implements classes for searching and anlyzing source code clones.
11///
12//===----------------------------------------------------------------------===//
13
14#include "clang/Analysis/CloneDetection.h"
15
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/RecursiveASTVisitor.h"
18#include "clang/AST/Stmt.h"
19#include "llvm/ADT/StringRef.h"
20
21using namespace clang;
22
23StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
24 unsigned StartIndex, unsigned EndIndex)
25 : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
26 assert(Stmt && "Stmt must not be a nullptr");
27 assert(StartIndex < EndIndex && "Given array should not be empty");
28 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
29}
30
31StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
32 : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
33
34StmtSequence::StmtSequence()
35 : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
36
37bool StmtSequence::contains(const StmtSequence &Other) const {
38 // If both sequences reside in different translation units, they can never
39 // contain each other.
40 if (Context != Other.Context)
41 return false;
42
43 const SourceManager &SM = Context->getSourceManager();
44
45 // Otherwise check if the start and end locations of the current sequence
46 // surround the other sequence.
47 bool StartIsInBounds =
48 SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
49 getStartLoc() == Other.getStartLoc();
50 if (!StartIsInBounds)
51 return false;
52
53 bool EndIsInBounds =
54 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
55 Other.getEndLoc() == getEndLoc();
56 return EndIsInBounds;
57}
58
59StmtSequence::iterator StmtSequence::begin() const {
60 if (!holdsSequence()) {
61 return &S;
62 }
63 auto CS = cast<CompoundStmt>(S);
64 return CS->body_begin() + StartIndex;
65}
66
67StmtSequence::iterator StmtSequence::end() const {
68 if (!holdsSequence()) {
69 return &S + 1;
70 }
71 auto CS = cast<CompoundStmt>(S);
72 return CS->body_begin() + EndIndex;
73}
74
75SourceLocation StmtSequence::getStartLoc() const {
76 return front()->getLocStart();
77}
78
79SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
80
81namespace {
82/// Generates CloneSignatures for a set of statements and stores the results in
83/// a CloneDetector object.
84class CloneSignatureGenerator {
85
86 CloneDetector &CD;
87 ASTContext &Context;
88
89 /// \brief Generates CloneSignatures for all statements in the given statement
90 /// tree and stores them in the CloneDetector.
91 ///
92 /// \param S The root of the given statement tree.
93 /// \return The CloneSignature of the root statement.
94 CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
95 // Create an empty signature that will be filled in this method.
96 CloneDetector::CloneSignature Signature;
97
98 // The only relevant data for now is the class of the statement.
99 // TODO: Collect statement class specific data.
100 Signature.Data.push_back(S->getStmtClass());
101
102 // Storage for the signatures of the direct child statements. This is only
103 // needed if the current statement is a CompoundStmt.
104 std::vector<CloneDetector::CloneSignature> ChildSignatures;
105 const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
106
107 // The signature of a statement includes the signatures of its children.
108 // Therefore we create the signatures for every child and add them to the
109 // current signature.
110 for (const Stmt *Child : S->children()) {
111 // Some statements like 'if' can have nullptr children that we will skip.
112 if (!Child)
113 continue;
114
115 // Recursive call to create the signature of the child statement. This
116 // will also create and store all clone groups in this child statement.
117 auto ChildSignature = generateSignatures(Child);
118
119 // Add the collected data to the signature of the current statement.
120 Signature.add(ChildSignature);
121
122 // If the current statement is a CompoundStatement, we need to store the
123 // signature for the generation of the sub-sequences.
124 if (CS)
125 ChildSignatures.push_back(ChildSignature);
126 }
127
128 // If the current statement is a CompoundStmt, we also need to create the
129 // clone groups from the sub-sequences inside the children.
130 if (CS)
131 handleSubSequences(CS, ChildSignatures);
132
133 // Save the signature for the current statement in the CloneDetector object.
134 CD.add(StmtSequence(S, Context), Signature);
135
136 return Signature;
137 }
138
139 /// \brief Adds all possible sub-sequences in the child array of the given
140 /// CompoundStmt to the CloneDetector.
141 /// \param CS The given CompoundStmt.
142 /// \param ChildSignatures A list of calculated signatures for each child in
143 /// the given CompoundStmt.
144 void handleSubSequences(
145 const CompoundStmt *CS,
146 const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
147
148 // FIXME: This function has quadratic runtime right now. Check if skipping
149 // this function for too long CompoundStmts is an option.
150
151 // The length of the sub-sequence. We don't need to handle sequences with
152 // the length 1 as they are already handled in CollectData().
153 for (unsigned Length = 2; Length <= CS->size(); ++Length) {
154 // The start index in the body of the CompoundStmt. We increase the
155 // position until the end of the sub-sequence reaches the end of the
156 // CompoundStmt body.
157 for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
158 // Create an empty signature and add the signatures of all selected
159 // child statements to it.
160 CloneDetector::CloneSignature SubSignature;
161
162 for (unsigned i = Pos; i < Pos + Length; ++i) {
163 SubSignature.add(ChildSignatures[i]);
164 }
165
166 // Save the signature together with the information about what children
167 // sequence we selected.
168 CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
169 }
170 }
171 }
172
173public:
174 explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
175 : CD(CD), Context(Context) {}
176
177 /// \brief Generates signatures for all statements in the given function body.
178 void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
179};
180} // end anonymous namespace
181
182void CloneDetector::analyzeCodeBody(const Decl *D) {
183 assert(D);
184 assert(D->hasBody());
185 CloneSignatureGenerator Generator(*this, D->getASTContext());
186 Generator.consumeCodeBody(D->getBody());
187}
188
189void CloneDetector::add(const StmtSequence &S,
190 const CloneSignature &Signature) {
191 // StringMap only works with StringRefs, so we create one for our data vector.
192 auto &Data = Signature.Data;
193 StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
194 Data.size() * sizeof(unsigned));
195
196 // Search with the help of the signature if we already have encountered a
197 // clone of the given StmtSequence.
198 auto I = CloneGroupIndexes.find(DataRef);
199 if (I == CloneGroupIndexes.end()) {
200 // We haven't found an existing clone group, so we create a new clone group
201 // for this StmtSequence and store the index of it in our search map.
202 CloneGroupIndexes[DataRef] = CloneGroups.size();
203 CloneGroups.emplace_back(S, Signature.Complexity);
204 return;
205 }
206
207 // We have found an existing clone group and can expand it with the given
208 // StmtSequence.
209 CloneGroups[I->getValue()].Sequences.push_back(S);
210}
211
212namespace {
213/// \brief Returns true if and only if \p Stmt contains at least one other
214/// sequence in the \p Group.
215bool containsAnyInGroup(StmtSequence &Stmt,
216 CloneDetector::CloneGroup &Group) {
217 for (StmtSequence &GroupStmt : Group.Sequences) {
218 if (Stmt.contains(GroupStmt))
219 return true;
220 }
221 return false;
222}
223
224/// \brief Returns true if and only if all sequences in \p OtherGroup are
225/// contained by a sequence in \p Group.
226bool containsGroup(CloneDetector::CloneGroup &Group,
227 CloneDetector::CloneGroup &OtherGroup) {
228 // We have less sequences in the current group than we have in the other,
229 // so we will never fulfill the requirement for returning true. This is only
230 // possible because we know that a sequence in Group can contain at most
231 // one sequence in OtherGroup.
232 if (Group.Sequences.size() < OtherGroup.Sequences.size())
233 return false;
234
235 for (StmtSequence &Stmt : Group.Sequences) {
236 if (!containsAnyInGroup(Stmt, OtherGroup))
237 return false;
238 }
239 return true;
240}
241} // end anonymous namespace
242
243void CloneDetector::findClones(std::vector<CloneGroup> &Result,
244 unsigned MinGroupComplexity) {
245 // Add every valid clone group that fulfills the complexity requirement.
246 for (const CloneGroup &Group : CloneGroups) {
247 if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
248 Result.push_back(Group);
249 }
250 }
251
252 std::vector<unsigned> IndexesToRemove;
253
254 // Compare every group in the result with the rest. If one groups contains
255 // another group, we only need to return the bigger group.
256 // Note: This doesn't scale well, so if possible avoid calling any heavy
257 // function from this loop to minimize the performance impact.
258 for (unsigned i = 0; i < Result.size(); ++i) {
259 for (unsigned j = 0; j < Result.size(); ++j) {
260 // Don't compare a group with itself.
261 if (i == j)
262 continue;
263
264 if (containsGroup(Result[j], Result[i])) {
265 IndexesToRemove.push_back(i);
266 break;
267 }
268 }
269 }
270
271 // Erasing a list of indexes from the vector should be done with decreasing
272 // indexes. As IndexesToRemove is constructed with increasing values, we just
273 // reverse iterate over it to get the desired order.
274 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
275 Result.erase(Result.begin() + *I);
276 }
277}