Artem Dergachev | ba81632 | 2016-07-26 18:13:12 +0000 | [diff] [blame^] | 1 | //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | /// |
| 10 | /// This file implements classes for searching and anlyzing source code clones. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "clang/Analysis/CloneDetection.h" |
| 15 | |
| 16 | #include "clang/AST/ASTContext.h" |
| 17 | #include "clang/AST/RecursiveASTVisitor.h" |
| 18 | #include "clang/AST/Stmt.h" |
| 19 | #include "llvm/ADT/StringRef.h" |
| 20 | |
| 21 | using namespace clang; |
| 22 | |
| 23 | StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context, |
| 24 | unsigned StartIndex, unsigned EndIndex) |
| 25 | : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) { |
| 26 | assert(Stmt && "Stmt must not be a nullptr"); |
| 27 | assert(StartIndex < EndIndex && "Given array should not be empty"); |
| 28 | assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); |
| 29 | } |
| 30 | |
| 31 | StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context) |
| 32 | : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {} |
| 33 | |
| 34 | StmtSequence::StmtSequence() |
| 35 | : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {} |
| 36 | |
| 37 | bool StmtSequence::contains(const StmtSequence &Other) const { |
| 38 | // If both sequences reside in different translation units, they can never |
| 39 | // contain each other. |
| 40 | if (Context != Other.Context) |
| 41 | return false; |
| 42 | |
| 43 | const SourceManager &SM = Context->getSourceManager(); |
| 44 | |
| 45 | // Otherwise check if the start and end locations of the current sequence |
| 46 | // surround the other sequence. |
| 47 | bool StartIsInBounds = |
| 48 | SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) || |
| 49 | getStartLoc() == Other.getStartLoc(); |
| 50 | if (!StartIsInBounds) |
| 51 | return false; |
| 52 | |
| 53 | bool EndIsInBounds = |
| 54 | SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || |
| 55 | Other.getEndLoc() == getEndLoc(); |
| 56 | return EndIsInBounds; |
| 57 | } |
| 58 | |
| 59 | StmtSequence::iterator StmtSequence::begin() const { |
| 60 | if (!holdsSequence()) { |
| 61 | return &S; |
| 62 | } |
| 63 | auto CS = cast<CompoundStmt>(S); |
| 64 | return CS->body_begin() + StartIndex; |
| 65 | } |
| 66 | |
| 67 | StmtSequence::iterator StmtSequence::end() const { |
| 68 | if (!holdsSequence()) { |
| 69 | return &S + 1; |
| 70 | } |
| 71 | auto CS = cast<CompoundStmt>(S); |
| 72 | return CS->body_begin() + EndIndex; |
| 73 | } |
| 74 | |
| 75 | SourceLocation StmtSequence::getStartLoc() const { |
| 76 | return front()->getLocStart(); |
| 77 | } |
| 78 | |
| 79 | SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } |
| 80 | |
| 81 | namespace { |
| 82 | /// Generates CloneSignatures for a set of statements and stores the results in |
| 83 | /// a CloneDetector object. |
| 84 | class CloneSignatureGenerator { |
| 85 | |
| 86 | CloneDetector &CD; |
| 87 | ASTContext &Context; |
| 88 | |
| 89 | /// \brief Generates CloneSignatures for all statements in the given statement |
| 90 | /// tree and stores them in the CloneDetector. |
| 91 | /// |
| 92 | /// \param S The root of the given statement tree. |
| 93 | /// \return The CloneSignature of the root statement. |
| 94 | CloneDetector::CloneSignature generateSignatures(const Stmt *S) { |
| 95 | // Create an empty signature that will be filled in this method. |
| 96 | CloneDetector::CloneSignature Signature; |
| 97 | |
| 98 | // The only relevant data for now is the class of the statement. |
| 99 | // TODO: Collect statement class specific data. |
| 100 | Signature.Data.push_back(S->getStmtClass()); |
| 101 | |
| 102 | // Storage for the signatures of the direct child statements. This is only |
| 103 | // needed if the current statement is a CompoundStmt. |
| 104 | std::vector<CloneDetector::CloneSignature> ChildSignatures; |
| 105 | const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S); |
| 106 | |
| 107 | // The signature of a statement includes the signatures of its children. |
| 108 | // Therefore we create the signatures for every child and add them to the |
| 109 | // current signature. |
| 110 | for (const Stmt *Child : S->children()) { |
| 111 | // Some statements like 'if' can have nullptr children that we will skip. |
| 112 | if (!Child) |
| 113 | continue; |
| 114 | |
| 115 | // Recursive call to create the signature of the child statement. This |
| 116 | // will also create and store all clone groups in this child statement. |
| 117 | auto ChildSignature = generateSignatures(Child); |
| 118 | |
| 119 | // Add the collected data to the signature of the current statement. |
| 120 | Signature.add(ChildSignature); |
| 121 | |
| 122 | // If the current statement is a CompoundStatement, we need to store the |
| 123 | // signature for the generation of the sub-sequences. |
| 124 | if (CS) |
| 125 | ChildSignatures.push_back(ChildSignature); |
| 126 | } |
| 127 | |
| 128 | // If the current statement is a CompoundStmt, we also need to create the |
| 129 | // clone groups from the sub-sequences inside the children. |
| 130 | if (CS) |
| 131 | handleSubSequences(CS, ChildSignatures); |
| 132 | |
| 133 | // Save the signature for the current statement in the CloneDetector object. |
| 134 | CD.add(StmtSequence(S, Context), Signature); |
| 135 | |
| 136 | return Signature; |
| 137 | } |
| 138 | |
| 139 | /// \brief Adds all possible sub-sequences in the child array of the given |
| 140 | /// CompoundStmt to the CloneDetector. |
| 141 | /// \param CS The given CompoundStmt. |
| 142 | /// \param ChildSignatures A list of calculated signatures for each child in |
| 143 | /// the given CompoundStmt. |
| 144 | void handleSubSequences( |
| 145 | const CompoundStmt *CS, |
| 146 | const std::vector<CloneDetector::CloneSignature> &ChildSignatures) { |
| 147 | |
| 148 | // FIXME: This function has quadratic runtime right now. Check if skipping |
| 149 | // this function for too long CompoundStmts is an option. |
| 150 | |
| 151 | // The length of the sub-sequence. We don't need to handle sequences with |
| 152 | // the length 1 as they are already handled in CollectData(). |
| 153 | for (unsigned Length = 2; Length <= CS->size(); ++Length) { |
| 154 | // The start index in the body of the CompoundStmt. We increase the |
| 155 | // position until the end of the sub-sequence reaches the end of the |
| 156 | // CompoundStmt body. |
| 157 | for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { |
| 158 | // Create an empty signature and add the signatures of all selected |
| 159 | // child statements to it. |
| 160 | CloneDetector::CloneSignature SubSignature; |
| 161 | |
| 162 | for (unsigned i = Pos; i < Pos + Length; ++i) { |
| 163 | SubSignature.add(ChildSignatures[i]); |
| 164 | } |
| 165 | |
| 166 | // Save the signature together with the information about what children |
| 167 | // sequence we selected. |
| 168 | CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature); |
| 169 | } |
| 170 | } |
| 171 | } |
| 172 | |
| 173 | public: |
| 174 | explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context) |
| 175 | : CD(CD), Context(Context) {} |
| 176 | |
| 177 | /// \brief Generates signatures for all statements in the given function body. |
| 178 | void consumeCodeBody(const Stmt *S) { generateSignatures(S); } |
| 179 | }; |
| 180 | } // end anonymous namespace |
| 181 | |
| 182 | void CloneDetector::analyzeCodeBody(const Decl *D) { |
| 183 | assert(D); |
| 184 | assert(D->hasBody()); |
| 185 | CloneSignatureGenerator Generator(*this, D->getASTContext()); |
| 186 | Generator.consumeCodeBody(D->getBody()); |
| 187 | } |
| 188 | |
| 189 | void CloneDetector::add(const StmtSequence &S, |
| 190 | const CloneSignature &Signature) { |
| 191 | // StringMap only works with StringRefs, so we create one for our data vector. |
| 192 | auto &Data = Signature.Data; |
| 193 | StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()), |
| 194 | Data.size() * sizeof(unsigned)); |
| 195 | |
| 196 | // Search with the help of the signature if we already have encountered a |
| 197 | // clone of the given StmtSequence. |
| 198 | auto I = CloneGroupIndexes.find(DataRef); |
| 199 | if (I == CloneGroupIndexes.end()) { |
| 200 | // We haven't found an existing clone group, so we create a new clone group |
| 201 | // for this StmtSequence and store the index of it in our search map. |
| 202 | CloneGroupIndexes[DataRef] = CloneGroups.size(); |
| 203 | CloneGroups.emplace_back(S, Signature.Complexity); |
| 204 | return; |
| 205 | } |
| 206 | |
| 207 | // We have found an existing clone group and can expand it with the given |
| 208 | // StmtSequence. |
| 209 | CloneGroups[I->getValue()].Sequences.push_back(S); |
| 210 | } |
| 211 | |
| 212 | namespace { |
| 213 | /// \brief Returns true if and only if \p Stmt contains at least one other |
| 214 | /// sequence in the \p Group. |
| 215 | bool containsAnyInGroup(StmtSequence &Stmt, |
| 216 | CloneDetector::CloneGroup &Group) { |
| 217 | for (StmtSequence &GroupStmt : Group.Sequences) { |
| 218 | if (Stmt.contains(GroupStmt)) |
| 219 | return true; |
| 220 | } |
| 221 | return false; |
| 222 | } |
| 223 | |
| 224 | /// \brief Returns true if and only if all sequences in \p OtherGroup are |
| 225 | /// contained by a sequence in \p Group. |
| 226 | bool containsGroup(CloneDetector::CloneGroup &Group, |
| 227 | CloneDetector::CloneGroup &OtherGroup) { |
| 228 | // We have less sequences in the current group than we have in the other, |
| 229 | // so we will never fulfill the requirement for returning true. This is only |
| 230 | // possible because we know that a sequence in Group can contain at most |
| 231 | // one sequence in OtherGroup. |
| 232 | if (Group.Sequences.size() < OtherGroup.Sequences.size()) |
| 233 | return false; |
| 234 | |
| 235 | for (StmtSequence &Stmt : Group.Sequences) { |
| 236 | if (!containsAnyInGroup(Stmt, OtherGroup)) |
| 237 | return false; |
| 238 | } |
| 239 | return true; |
| 240 | } |
| 241 | } // end anonymous namespace |
| 242 | |
| 243 | void CloneDetector::findClones(std::vector<CloneGroup> &Result, |
| 244 | unsigned MinGroupComplexity) { |
| 245 | // Add every valid clone group that fulfills the complexity requirement. |
| 246 | for (const CloneGroup &Group : CloneGroups) { |
| 247 | if (Group.isValid() && Group.Complexity >= MinGroupComplexity) { |
| 248 | Result.push_back(Group); |
| 249 | } |
| 250 | } |
| 251 | |
| 252 | std::vector<unsigned> IndexesToRemove; |
| 253 | |
| 254 | // Compare every group in the result with the rest. If one groups contains |
| 255 | // another group, we only need to return the bigger group. |
| 256 | // Note: This doesn't scale well, so if possible avoid calling any heavy |
| 257 | // function from this loop to minimize the performance impact. |
| 258 | for (unsigned i = 0; i < Result.size(); ++i) { |
| 259 | for (unsigned j = 0; j < Result.size(); ++j) { |
| 260 | // Don't compare a group with itself. |
| 261 | if (i == j) |
| 262 | continue; |
| 263 | |
| 264 | if (containsGroup(Result[j], Result[i])) { |
| 265 | IndexesToRemove.push_back(i); |
| 266 | break; |
| 267 | } |
| 268 | } |
| 269 | } |
| 270 | |
| 271 | // Erasing a list of indexes from the vector should be done with decreasing |
| 272 | // indexes. As IndexesToRemove is constructed with increasing values, we just |
| 273 | // reverse iterate over it to get the desired order. |
| 274 | for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { |
| 275 | Result.erase(Result.begin() + *I); |
| 276 | } |
| 277 | } |