blob: 038f9eb87cd0f840f8a33e7656aa64fad04f5d30 [file] [log] [blame]
Artem Dergachevba816322016-07-26 18:13:12 +00001//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// This file implements classes for searching and anlyzing source code clones.
11///
12//===----------------------------------------------------------------------===//
13
14#include "clang/Analysis/CloneDetection.h"
15
16#include "clang/AST/ASTContext.h"
17#include "clang/AST/RecursiveASTVisitor.h"
18#include "clang/AST/Stmt.h"
Artem Dergachev78692ea2016-08-02 12:21:09 +000019#include "clang/AST/StmtVisitor.h"
Artem Dergachevba816322016-07-26 18:13:12 +000020#include "llvm/ADT/StringRef.h"
21
22using namespace clang;
23
24StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
25 unsigned StartIndex, unsigned EndIndex)
26 : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
27 assert(Stmt && "Stmt must not be a nullptr");
28 assert(StartIndex < EndIndex && "Given array should not be empty");
29 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
30}
31
32StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
33 : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
34
35StmtSequence::StmtSequence()
36 : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
37
38bool StmtSequence::contains(const StmtSequence &Other) const {
39 // If both sequences reside in different translation units, they can never
40 // contain each other.
41 if (Context != Other.Context)
42 return false;
43
44 const SourceManager &SM = Context->getSourceManager();
45
46 // Otherwise check if the start and end locations of the current sequence
47 // surround the other sequence.
48 bool StartIsInBounds =
49 SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
50 getStartLoc() == Other.getStartLoc();
51 if (!StartIsInBounds)
52 return false;
53
54 bool EndIsInBounds =
55 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
56 Other.getEndLoc() == getEndLoc();
57 return EndIsInBounds;
58}
59
60StmtSequence::iterator StmtSequence::begin() const {
61 if (!holdsSequence()) {
62 return &S;
63 }
64 auto CS = cast<CompoundStmt>(S);
65 return CS->body_begin() + StartIndex;
66}
67
68StmtSequence::iterator StmtSequence::end() const {
69 if (!holdsSequence()) {
Vassil Vassilev5721e0f2016-08-09 10:00:23 +000070 return reinterpret_cast<StmtSequence::iterator>(&S) + 1;
Artem Dergachevba816322016-07-26 18:13:12 +000071 }
72 auto CS = cast<CompoundStmt>(S);
73 return CS->body_begin() + EndIndex;
74}
75
76SourceLocation StmtSequence::getStartLoc() const {
77 return front()->getLocStart();
78}
79
80SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
81
82namespace {
Artem Dergachev7a0088b2016-08-04 19:37:00 +000083
84/// \brief Analyzes the pattern of the referenced variables in a statement.
85class VariablePattern {
86
87 /// \brief Describes an occurence of a variable reference in a statement.
88 struct VariableOccurence {
89 /// The index of the associated VarDecl in the Variables vector.
90 size_t KindID;
91
92 VariableOccurence(size_t KindID) : KindID(KindID) {}
93 };
94
95 /// All occurences of referenced variables in the order of appearance.
96 std::vector<VariableOccurence> Occurences;
97 /// List of referenced variables in the order of appearance.
98 /// Every item in this list is unique.
99 std::vector<const VarDecl *> Variables;
100
101 /// \brief Adds a new variable referenced to this pattern.
102 /// \param VarDecl The declaration of the variable that is referenced.
103 void addVariableOccurence(const VarDecl *VarDecl) {
104 // First check if we already reference this variable
105 for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
106 if (Variables[KindIndex] == VarDecl) {
107 // If yes, add a new occurence that points to the existing entry in
108 // the Variables vector.
109 Occurences.emplace_back(KindIndex);
110 return;
111 }
112 }
113 // If this variable wasn't already referenced, add it to the list of
114 // referenced variables and add a occurence that points to this new entry.
115 Occurences.emplace_back(Variables.size());
116 Variables.push_back(VarDecl);
117 }
118
119 /// \brief Adds each referenced variable from the given statement.
120 void addVariables(const Stmt *S) {
121 // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
122 // children). We skip such statements as they don't reference any
123 // variables.
124 if (!S)
125 return;
126
127 // Check if S is a reference to a variable. If yes, add it to the pattern.
128 if (auto D = dyn_cast<DeclRefExpr>(S)) {
129 if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
130 addVariableOccurence(VD);
131 }
132
133 // Recursively check all children of the given statement.
134 for (const Stmt *Child : S->children()) {
135 addVariables(Child);
136 }
137 }
138
139public:
140 /// \brief Creates an VariablePattern object with information about the given
141 /// StmtSequence.
142 VariablePattern(const StmtSequence &Sequence) {
143 for (const Stmt *S : Sequence)
144 addVariables(S);
145 }
146
147 /// \brief Compares this pattern with the given one.
148 /// \param Other The given VariablePattern to compare with.
149 /// \return Returns true if and only if the references variables in this
150 /// object follow the same pattern than the ones in the given
151 /// VariablePattern.
152 ///
153 /// For example, the following statements all have the same pattern:
154 ///
155 /// if (a < b) return a; return b;
156 /// if (x < y) return x; return y;
157 /// if (u2 < u1) return u2; return u1;
158 ///
159 /// but the following statement has a different pattern (note the changed
160 /// variables in the return statements).
161 ///
162 /// if (a < b) return b; return a;
163 ///
164 /// This function should only be called if the related statements of the given
165 /// pattern and the statements of this objects are clones of each other.
166 bool comparePattern(const VariablePattern &Other) {
167 assert(Other.Occurences.size() == Occurences.size());
168 for (unsigned i = 0; i < Occurences.size(); ++i) {
169 if (Occurences[i].KindID != Other.Occurences[i].KindID) {
170 return false;
171 }
172 }
173 return true;
174 }
175};
176}
177
178namespace {
Artem Dergachev78692ea2016-08-02 12:21:09 +0000179/// \brief Collects the data of a single Stmt.
180///
181/// This class defines what a code clone is: If it collects for two statements
182/// the same data, then those two statements are considered to be clones of each
183/// other.
184class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
185
186 ASTContext &Context;
187 std::vector<CloneDetector::DataPiece> &CollectedData;
188
189public:
190 /// \brief Collects data of the given Stmt.
191 /// \param S The given statement.
192 /// \param Context The ASTContext of S.
193 /// \param D The given data vector to which all collected data is appended.
194 StmtDataCollector(const Stmt *S, ASTContext &Context,
195 std::vector<CloneDetector::DataPiece> &D)
196 : Context(Context), CollectedData(D) {
197 Visit(S);
198 }
199
200 // Below are utility methods for appending different data to the vector.
201
202 void addData(CloneDetector::DataPiece Integer) {
203 CollectedData.push_back(Integer);
204 }
205
206 // FIXME: The functions below add long strings to the data vector which are
207 // probably not good for performance. Replace the strings with pointer values
208 // or a some other unique integer.
209
210 void addData(llvm::StringRef Str) {
211 if (Str.empty())
212 return;
213
214 const size_t OldSize = CollectedData.size();
215
216 const size_t PieceSize = sizeof(CloneDetector::DataPiece);
217 // Calculate how many vector units we need to accomodate all string bytes.
218 size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
219 // Allocate space for the string in the data vector.
220 CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
221
222 // Copy the string to the allocated space at the end of the vector.
223 std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
224 }
225
226 void addData(const QualType &QT) { addData(QT.getAsString()); }
227
228// The functions below collect the class specific data of each Stmt subclass.
229
230// Utility macro for defining a visit method for a given class. This method
231// calls back to the ConstStmtVisitor to visit all parent classes.
232#define DEF_ADD_DATA(CLASS, CODE) \
233 void Visit##CLASS(const CLASS *S) { \
234 CODE; \
235 ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S); \
236 }
237
238 DEF_ADD_DATA(Stmt, { addData(S->getStmtClass()); })
239 DEF_ADD_DATA(Expr, { addData(S->getType()); })
240
241 //--- Builtin functionality ----------------------------------------------//
242 DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); })
243 DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); })
244 DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); })
245 DEF_ADD_DATA(TypeTraitExpr, {
246 addData(S->getTrait());
247 for (unsigned i = 0; i < S->getNumArgs(); ++i)
248 addData(S->getArg(i)->getType());
249 })
250
251 //--- Calls --------------------------------------------------------------//
252 DEF_ADD_DATA(CallExpr,
253 { addData(S->getDirectCallee()->getQualifiedNameAsString()); })
254
255 //--- Exceptions ---------------------------------------------------------//
256 DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); })
257
258 //--- C++ OOP Stmts ------------------------------------------------------//
259 DEF_ADD_DATA(CXXDeleteExpr, {
260 addData(S->isArrayFormAsWritten());
261 addData(S->isGlobalDelete());
262 })
263
264 //--- Casts --------------------------------------------------------------//
265 DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); })
266
267 //--- Miscellaneous Exprs ------------------------------------------------//
268 DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); })
269 DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); })
270
271 //--- Control flow -------------------------------------------------------//
272 DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); })
273 DEF_ADD_DATA(IndirectGotoStmt, {
274 if (S->getConstantTarget())
275 addData(S->getConstantTarget()->getName());
276 })
277 DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); })
278 DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); })
279 DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); })
280
281 //--- Objective-C --------------------------------------------------------//
282 DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); })
283 DEF_ADD_DATA(ObjCPropertyRefExpr, {
284 addData(S->isSuperReceiver());
285 addData(S->isImplicitProperty());
286 })
287 DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); })
288
289 //--- Miscellaneous Stmts ------------------------------------------------//
290 DEF_ADD_DATA(CXXFoldExpr, {
291 addData(S->isRightFold());
292 addData(S->getOperator());
293 })
294 DEF_ADD_DATA(GenericSelectionExpr, {
295 for (unsigned i = 0; i < S->getNumAssocs(); ++i) {
296 addData(S->getAssocType(i));
297 }
298 })
299 DEF_ADD_DATA(LambdaExpr, {
300 for (const LambdaCapture &C : S->captures()) {
301 addData(C.isPackExpansion());
302 addData(C.getCaptureKind());
303 if (C.capturesVariable())
304 addData(C.getCapturedVar()->getType());
305 }
306 addData(S->isGenericLambda());
307 addData(S->isMutable());
308 })
309 DEF_ADD_DATA(DeclStmt, {
310 auto numDecls = std::distance(S->decl_begin(), S->decl_end());
311 addData(static_cast<CloneDetector::DataPiece>(numDecls));
312 for (const Decl *D : S->decls()) {
313 if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
314 addData(VD->getType());
315 }
316 }
317 })
318 DEF_ADD_DATA(AsmStmt, {
319 addData(S->isSimple());
320 addData(S->isVolatile());
321 addData(S->generateAsmString(Context));
322 for (unsigned i = 0; i < S->getNumInputs(); ++i) {
323 addData(S->getInputConstraint(i));
324 }
325 for (unsigned i = 0; i < S->getNumOutputs(); ++i) {
326 addData(S->getOutputConstraint(i));
327 }
328 for (unsigned i = 0; i < S->getNumClobbers(); ++i) {
329 addData(S->getClobber(i));
330 }
331 })
332 DEF_ADD_DATA(AttributedStmt, {
333 for (const Attr *A : S->getAttrs()) {
334 addData(std::string(A->getSpelling()));
335 }
336 })
337};
338} // end anonymous namespace
339
340namespace {
Artem Dergachevba816322016-07-26 18:13:12 +0000341/// Generates CloneSignatures for a set of statements and stores the results in
342/// a CloneDetector object.
343class CloneSignatureGenerator {
344
345 CloneDetector &CD;
346 ASTContext &Context;
347
348 /// \brief Generates CloneSignatures for all statements in the given statement
349 /// tree and stores them in the CloneDetector.
350 ///
351 /// \param S The root of the given statement tree.
352 /// \return The CloneSignature of the root statement.
353 CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
354 // Create an empty signature that will be filled in this method.
355 CloneDetector::CloneSignature Signature;
356
Artem Dergachev78692ea2016-08-02 12:21:09 +0000357 // Collect all relevant data from S and put it into the empty signature.
358 StmtDataCollector(S, Context, Signature.Data);
Artem Dergachevba816322016-07-26 18:13:12 +0000359
360 // Storage for the signatures of the direct child statements. This is only
361 // needed if the current statement is a CompoundStmt.
362 std::vector<CloneDetector::CloneSignature> ChildSignatures;
363 const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
364
365 // The signature of a statement includes the signatures of its children.
366 // Therefore we create the signatures for every child and add them to the
367 // current signature.
368 for (const Stmt *Child : S->children()) {
369 // Some statements like 'if' can have nullptr children that we will skip.
370 if (!Child)
371 continue;
372
373 // Recursive call to create the signature of the child statement. This
374 // will also create and store all clone groups in this child statement.
375 auto ChildSignature = generateSignatures(Child);
376
377 // Add the collected data to the signature of the current statement.
378 Signature.add(ChildSignature);
379
380 // If the current statement is a CompoundStatement, we need to store the
381 // signature for the generation of the sub-sequences.
382 if (CS)
383 ChildSignatures.push_back(ChildSignature);
384 }
385
386 // If the current statement is a CompoundStmt, we also need to create the
387 // clone groups from the sub-sequences inside the children.
388 if (CS)
389 handleSubSequences(CS, ChildSignatures);
390
391 // Save the signature for the current statement in the CloneDetector object.
392 CD.add(StmtSequence(S, Context), Signature);
393
394 return Signature;
395 }
396
397 /// \brief Adds all possible sub-sequences in the child array of the given
398 /// CompoundStmt to the CloneDetector.
399 /// \param CS The given CompoundStmt.
400 /// \param ChildSignatures A list of calculated signatures for each child in
401 /// the given CompoundStmt.
402 void handleSubSequences(
403 const CompoundStmt *CS,
404 const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
405
406 // FIXME: This function has quadratic runtime right now. Check if skipping
407 // this function for too long CompoundStmts is an option.
408
409 // The length of the sub-sequence. We don't need to handle sequences with
410 // the length 1 as they are already handled in CollectData().
411 for (unsigned Length = 2; Length <= CS->size(); ++Length) {
412 // The start index in the body of the CompoundStmt. We increase the
413 // position until the end of the sub-sequence reaches the end of the
414 // CompoundStmt body.
415 for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
416 // Create an empty signature and add the signatures of all selected
417 // child statements to it.
418 CloneDetector::CloneSignature SubSignature;
419
420 for (unsigned i = Pos; i < Pos + Length; ++i) {
421 SubSignature.add(ChildSignatures[i]);
422 }
423
424 // Save the signature together with the information about what children
425 // sequence we selected.
426 CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
427 }
428 }
429 }
430
431public:
432 explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
433 : CD(CD), Context(Context) {}
434
435 /// \brief Generates signatures for all statements in the given function body.
436 void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
437};
438} // end anonymous namespace
439
440void CloneDetector::analyzeCodeBody(const Decl *D) {
441 assert(D);
442 assert(D->hasBody());
443 CloneSignatureGenerator Generator(*this, D->getASTContext());
444 Generator.consumeCodeBody(D->getBody());
445}
446
447void CloneDetector::add(const StmtSequence &S,
448 const CloneSignature &Signature) {
449 // StringMap only works with StringRefs, so we create one for our data vector.
450 auto &Data = Signature.Data;
451 StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
452 Data.size() * sizeof(unsigned));
453
454 // Search with the help of the signature if we already have encountered a
455 // clone of the given StmtSequence.
456 auto I = CloneGroupIndexes.find(DataRef);
457 if (I == CloneGroupIndexes.end()) {
458 // We haven't found an existing clone group, so we create a new clone group
459 // for this StmtSequence and store the index of it in our search map.
460 CloneGroupIndexes[DataRef] = CloneGroups.size();
461 CloneGroups.emplace_back(S, Signature.Complexity);
462 return;
463 }
464
465 // We have found an existing clone group and can expand it with the given
466 // StmtSequence.
467 CloneGroups[I->getValue()].Sequences.push_back(S);
468}
469
470namespace {
471/// \brief Returns true if and only if \p Stmt contains at least one other
472/// sequence in the \p Group.
Artem Dergachev7a0088b2016-08-04 19:37:00 +0000473bool containsAnyInGroup(StmtSequence &Stmt, CloneDetector::CloneGroup &Group) {
Artem Dergachevba816322016-07-26 18:13:12 +0000474 for (StmtSequence &GroupStmt : Group.Sequences) {
475 if (Stmt.contains(GroupStmt))
476 return true;
477 }
478 return false;
479}
480
481/// \brief Returns true if and only if all sequences in \p OtherGroup are
482/// contained by a sequence in \p Group.
483bool containsGroup(CloneDetector::CloneGroup &Group,
484 CloneDetector::CloneGroup &OtherGroup) {
485 // We have less sequences in the current group than we have in the other,
486 // so we will never fulfill the requirement for returning true. This is only
487 // possible because we know that a sequence in Group can contain at most
488 // one sequence in OtherGroup.
489 if (Group.Sequences.size() < OtherGroup.Sequences.size())
490 return false;
491
492 for (StmtSequence &Stmt : Group.Sequences) {
493 if (!containsAnyInGroup(Stmt, OtherGroup))
494 return false;
495 }
496 return true;
497}
498} // end anonymous namespace
499
Artem Dergachev7a0088b2016-08-04 19:37:00 +0000500/// \brief Finds all actual clone groups in a single group of presumed clones.
501/// \param Result Output parameter to which all found groups are added. Every
502/// clone in a group that was added this way follows the same
503/// variable pattern as the other clones in its group.
504/// \param Group A group of clones. The clones are allowed to have a different
505/// variable pattern.
506static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
507 const CloneDetector::CloneGroup &Group) {
508 // We remove the Sequences one by one, so a list is more appropriate.
509 std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
510 Group.Sequences.end());
511
512 // Search for clones as long as there could be clones in UnassignedSequences.
513 while (UnassignedSequences.size() > 1) {
514
515 // Pick the first Sequence as a protoype for a new clone group.
516 StmtSequence Prototype = UnassignedSequences.front();
517 UnassignedSequences.pop_front();
518
519 CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity);
520
521 // Analyze the variable pattern of the prototype. Every other StmtSequence
522 // needs to have the same pattern to get into the new clone group.
523 VariablePattern PrototypeFeatures(Prototype);
524
525 // Search all remaining StmtSequences for an identical variable pattern
526 // and assign them to our new clone group.
527 auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
528 while (I != E) {
529 if (VariablePattern(*I).comparePattern(PrototypeFeatures)) {
530 FilteredGroup.Sequences.push_back(*I);
531 I = UnassignedSequences.erase(I);
532 continue;
533 }
534 ++I;
535 }
536
537 // Add a valid clone group to the list of found clone groups.
538 if (!FilteredGroup.isValid())
539 continue;
540
541 Result.push_back(FilteredGroup);
542 }
543}
544
Artem Dergachevba816322016-07-26 18:13:12 +0000545void CloneDetector::findClones(std::vector<CloneGroup> &Result,
546 unsigned MinGroupComplexity) {
547 // Add every valid clone group that fulfills the complexity requirement.
548 for (const CloneGroup &Group : CloneGroups) {
549 if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
Artem Dergachev7a0088b2016-08-04 19:37:00 +0000550 createCloneGroups(Result, Group);
Artem Dergachevba816322016-07-26 18:13:12 +0000551 }
552 }
553
554 std::vector<unsigned> IndexesToRemove;
555
556 // Compare every group in the result with the rest. If one groups contains
557 // another group, we only need to return the bigger group.
558 // Note: This doesn't scale well, so if possible avoid calling any heavy
559 // function from this loop to minimize the performance impact.
560 for (unsigned i = 0; i < Result.size(); ++i) {
561 for (unsigned j = 0; j < Result.size(); ++j) {
562 // Don't compare a group with itself.
563 if (i == j)
564 continue;
565
566 if (containsGroup(Result[j], Result[i])) {
567 IndexesToRemove.push_back(i);
568 break;
569 }
570 }
571 }
572
573 // Erasing a list of indexes from the vector should be done with decreasing
574 // indexes. As IndexesToRemove is constructed with increasing values, we just
575 // reverse iterate over it to get the desired order.
576 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
577 Result.erase(Result.begin() + *I);
578 }
579}