| 1 | //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// |
|---|---|
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file defines classes for searching and analyzing source code clones. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_CLANG_AST_CLONEDETECTION_H |
| 15 | #define LLVM_CLANG_AST_CLONEDETECTION_H |
| 16 | |
| 17 | #include "clang/AST/StmtVisitor.h" |
| 18 | #include "llvm/Support/Regex.h" |
| 19 | #include <vector> |
| 20 | |
| 21 | namespace clang { |
| 22 | |
| 23 | class Stmt; |
| 24 | class Decl; |
| 25 | class VarDecl; |
| 26 | class ASTContext; |
| 27 | class CompoundStmt; |
| 28 | |
| 29 | /// Identifies a list of statements. |
| 30 | /// |
| 31 | /// Can either identify a single arbitrary Stmt object, a continuous sequence of |
| 32 | /// child statements inside a CompoundStmt or no statements at all. |
| 33 | class StmtSequence { |
| 34 | /// If this object identifies a sequence of statements inside a CompoundStmt, |
| 35 | /// S points to this CompoundStmt. If this object only identifies a single |
| 36 | /// Stmt, then S is a pointer to this Stmt. |
| 37 | const Stmt *S; |
| 38 | |
| 39 | /// The declaration that contains the statements. |
| 40 | const Decl *D; |
| 41 | |
| 42 | /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence |
| 43 | /// instance is representing the CompoundStmt children inside the array |
| 44 | /// [StartIndex, EndIndex). |
| 45 | unsigned StartIndex; |
| 46 | unsigned EndIndex; |
| 47 | |
| 48 | public: |
| 49 | /// Constructs a StmtSequence holding multiple statements. |
| 50 | /// |
| 51 | /// The resulting StmtSequence identifies a continuous sequence of statements |
| 52 | /// in the body of the given CompoundStmt. Which statements of the body should |
| 53 | /// be identified needs to be specified by providing a start and end index |
| 54 | /// that describe a non-empty sub-array in the body of the given CompoundStmt. |
| 55 | /// |
| 56 | /// \param Stmt A CompoundStmt that contains all statements in its body. |
| 57 | /// \param D The Decl containing this Stmt. |
| 58 | /// \param StartIndex The inclusive start index in the children array of |
| 59 | /// \p Stmt |
| 60 | /// \param EndIndex The exclusive end index in the children array of \p Stmt. |
| 61 | StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, |
| 62 | unsigned EndIndex); |
| 63 | |
| 64 | /// Constructs a StmtSequence holding a single statement. |
| 65 | /// |
| 66 | /// \param Stmt An arbitrary Stmt. |
| 67 | /// \param D The Decl containing this Stmt. |
| 68 | StmtSequence(const Stmt *Stmt, const Decl *D); |
| 69 | |
| 70 | /// Constructs an empty StmtSequence. |
| 71 | StmtSequence(); |
| 72 | |
| 73 | typedef const Stmt *const *iterator; |
| 74 | |
| 75 | /// Returns an iterator pointing to the first statement in this sequence. |
| 76 | iterator begin() const; |
| 77 | |
| 78 | /// Returns an iterator pointing behind the last statement in this sequence. |
| 79 | iterator end() const; |
| 80 | |
| 81 | /// Returns the first statement in this sequence. |
| 82 | /// |
| 83 | /// This method should only be called on a non-empty StmtSequence object. |
| 84 | const Stmt *front() const { |
| 85 | assert(!empty()); |
| 86 | return begin()[0]; |
| 87 | } |
| 88 | |
| 89 | /// Returns the last statement in this sequence. |
| 90 | /// |
| 91 | /// This method should only be called on a non-empty StmtSequence object. |
| 92 | const Stmt *back() const { |
| 93 | assert(!empty()); |
| 94 | return begin()[size() - 1]; |
| 95 | } |
| 96 | |
| 97 | /// Returns the number of statements this object holds. |
| 98 | unsigned size() const { |
| 99 | if (holdsSequence()) |
| 100 | return EndIndex - StartIndex; |
| 101 | if (S == nullptr) |
| 102 | return 0; |
| 103 | return 1; |
| 104 | } |
| 105 | |
| 106 | /// Returns true if and only if this StmtSequence contains no statements. |
| 107 | bool empty() const { return size() == 0; } |
| 108 | |
| 109 | /// Returns the related ASTContext for the stored Stmts. |
| 110 | ASTContext &getASTContext() const; |
| 111 | |
| 112 | /// Returns the declaration that contains the stored Stmts. |
| 113 | const Decl *getContainingDecl() const { |
| 114 | assert(D); |
| 115 | return D; |
| 116 | } |
| 117 | |
| 118 | /// Returns true if this objects holds a list of statements. |
| 119 | bool holdsSequence() const { return EndIndex != 0; } |
| 120 | |
| 121 | /// Returns the start sourcelocation of the first statement in this sequence. |
| 122 | /// |
| 123 | /// This method should only be called on a non-empty StmtSequence object. |
| 124 | SourceLocation getBeginLoc() const; |
| 125 | |
| 126 | /// Returns the end sourcelocation of the last statement in this sequence. |
| 127 | /// |
| 128 | /// This method should only be called on a non-empty StmtSequence object. |
| 129 | SourceLocation getEndLoc() const; |
| 130 | |
| 131 | /// Returns the source range of the whole sequence - from the beginning |
| 132 | /// of the first statement to the end of the last statement. |
| 133 | SourceRange getSourceRange() const; |
| 134 | |
| 135 | bool operator==(const StmtSequence &Other) const { |
| 136 | return std::tie(S, StartIndex, EndIndex) == |
| 137 | std::tie(Other.S, Other.StartIndex, Other.EndIndex); |
| 138 | } |
| 139 | |
| 140 | bool operator!=(const StmtSequence &Other) const { |
| 141 | return std::tie(S, StartIndex, EndIndex) != |
| 142 | std::tie(Other.S, Other.StartIndex, Other.EndIndex); |
| 143 | } |
| 144 | |
| 145 | /// Returns true if and only if this sequence covers a source range that |
| 146 | /// contains the source range of the given sequence \p Other. |
| 147 | /// |
| 148 | /// This method should only be called on a non-empty StmtSequence object |
| 149 | /// and passed a non-empty StmtSequence object. |
| 150 | bool contains(const StmtSequence &Other) const; |
| 151 | }; |
| 152 | |
| 153 | /// Searches for similar subtrees in the AST. |
| 154 | /// |
| 155 | /// First, this class needs several declarations with statement bodies which |
| 156 | /// can be passed via analyzeCodeBody. Afterwards all statements can be |
| 157 | /// searched for clones by calling findClones with a given list of constraints |
| 158 | /// that should specify the wanted properties of the clones. |
| 159 | /// |
| 160 | /// The result of findClones can be further constrained with the constrainClones |
| 161 | /// method. |
| 162 | /// |
| 163 | /// This class only searches for clones in executable source code |
| 164 | /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) |
| 165 | /// are not supported. |
| 166 | class CloneDetector { |
| 167 | |
| 168 | public: |
| 169 | /// A collection of StmtSequences that share an arbitrary property. |
| 170 | typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; |
| 171 | |
| 172 | /// Generates and stores search data for all statements in the body of |
| 173 | /// the given Decl. |
| 174 | void analyzeCodeBody(const Decl *D); |
| 175 | |
| 176 | /// Constrains the given list of clone groups with the given constraint. |
| 177 | /// |
| 178 | /// The constraint is expected to have a method with the signature |
| 179 | /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` |
| 180 | /// as this is the interface that the CloneDetector uses for applying the |
| 181 | /// constraint. The constraint is supposed to directly modify the passed list |
| 182 | /// so that all clones in the list fulfill the specific property this |
| 183 | /// constraint ensures. |
| 184 | template <typename T> |
| 185 | static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { |
| 186 | C.constrain(CloneGroups); |
| 187 | } |
| 188 | |
| 189 | /// Constrains the given list of clone groups with the given list of |
| 190 | /// constraints. |
| 191 | /// |
| 192 | /// The constraints are applied in sequence in the order in which they are |
| 193 | /// passed to this function. |
| 194 | template <typename T1, typename... Ts> |
| 195 | static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, |
| 196 | Ts... ConstraintList) { |
| 197 | constrainClones(CloneGroups, C); |
| 198 | constrainClones(CloneGroups, ConstraintList...); |
| 199 | } |
| 200 | |
| 201 | /// Searches for clones in all previously passed statements. |
| 202 | /// \param Result Output parameter to which all created clone groups are |
| 203 | /// added. |
| 204 | /// \param ConstraintList The constraints that should be applied to the |
| 205 | // result. |
| 206 | template <typename... Ts> |
| 207 | void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { |
| 208 | // The initial assumption is that there is only one clone group and every |
| 209 | // statement is a clone of the others. This clone group will then be |
| 210 | // split up with the help of the constraints. |
| 211 | CloneGroup AllClones; |
| 212 | AllClones.reserve(Sequences.size()); |
| 213 | for (const auto &C : Sequences) { |
| 214 | AllClones.push_back(C); |
| 215 | } |
| 216 | |
| 217 | Result.push_back(AllClones); |
| 218 | |
| 219 | constrainClones(Result, ConstraintList...); |
| 220 | } |
| 221 | |
| 222 | private: |
| 223 | CloneGroup Sequences; |
| 224 | }; |
| 225 | |
| 226 | /// This class is a utility class that contains utility functions for building |
| 227 | /// custom constraints. |
| 228 | class CloneConstraint { |
| 229 | public: |
| 230 | /// Removes all groups by using a filter function. |
| 231 | /// \param CloneGroups The list of CloneGroups that is supposed to be |
| 232 | /// filtered. |
| 233 | /// \param Filter The filter function that should return true for all groups |
| 234 | /// that should be removed from the list. |
| 235 | static void filterGroups( |
| 236 | std::vector<CloneDetector::CloneGroup> &CloneGroups, |
| 237 | llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { |
| 238 | CloneGroups.erase( |
| 239 | std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter), |
| 240 | CloneGroups.end()); |
| 241 | } |
| 242 | |
| 243 | /// Splits the given CloneGroups until the given Compare function returns true |
| 244 | /// for all clones in a single group. |
| 245 | /// \param CloneGroups A list of CloneGroups that should be modified. |
| 246 | /// \param Compare The comparison function that all clones are supposed to |
| 247 | /// pass. Should return true if and only if two clones belong |
| 248 | /// to the same CloneGroup. |
| 249 | static void splitCloneGroups( |
| 250 | std::vector<CloneDetector::CloneGroup> &CloneGroups, |
| 251 | llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> |
| 252 | Compare); |
| 253 | }; |
| 254 | |
| 255 | /// This constraint moves clones into clone groups of type II via hashing. |
| 256 | /// |
| 257 | /// Clones with different hash values are moved into separate clone groups. |
| 258 | /// Collisions are possible, and this constraint does nothing to address this |
| 259 | /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the |
| 260 | /// constraint chain, not necessarily immediately, to eliminate hash collisions |
| 261 | /// through a more detailed analysis. |
| 262 | class RecursiveCloneTypeIIHashConstraint { |
| 263 | public: |
| 264 | void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); |
| 265 | }; |
| 266 | |
| 267 | /// This constraint moves clones into clone groups of type II by comparing them. |
| 268 | /// |
| 269 | /// Clones that aren't type II clones are moved into separate clone groups. |
| 270 | /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone |
| 271 | /// group are guaranteed to be be type II clones of each other, but it is too |
| 272 | /// slow to efficiently handle large amounts of clones. |
| 273 | class RecursiveCloneTypeIIVerifyConstraint { |
| 274 | public: |
| 275 | void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); |
| 276 | }; |
| 277 | |
| 278 | /// Ensures that every clone has at least the given complexity. |
| 279 | /// |
| 280 | /// Complexity is here defined as the total amount of children of a statement. |
| 281 | /// This constraint assumes the first statement in the group is representative |
| 282 | /// for all other statements in the group in terms of complexity. |
| 283 | class MinComplexityConstraint { |
| 284 | unsigned MinComplexity; |
| 285 | |
| 286 | public: |
| 287 | MinComplexityConstraint(unsigned MinComplexity) |
| 288 | : MinComplexity(MinComplexity) {} |
| 289 | |
| 290 | /// Calculates the complexity of the given StmtSequence. |
| 291 | /// \param Limit The limit of complexity we probe for. After reaching |
| 292 | /// this limit during calculation, this method is exiting |
| 293 | /// early to improve performance and returns this limit. |
| 294 | size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, |
| 295 | const std::string &ParentMacroStack = ""); |
| 296 | |
| 297 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
| 298 | CloneConstraint::filterGroups( |
| 299 | CloneGroups, [this](const CloneDetector::CloneGroup &A) { |
| 300 | if (!A.empty()) |
| 301 | return calculateStmtComplexity(A.front(), MinComplexity) < |
| 302 | MinComplexity; |
| 303 | else |
| 304 | return false; |
| 305 | }); |
| 306 | } |
| 307 | }; |
| 308 | |
| 309 | /// Ensures that all clone groups contain at least the given amount of clones. |
| 310 | class MinGroupSizeConstraint { |
| 311 | unsigned MinGroupSize; |
| 312 | |
| 313 | public: |
| 314 | MinGroupSizeConstraint(unsigned MinGroupSize = 2) |
| 315 | : MinGroupSize(MinGroupSize) {} |
| 316 | |
| 317 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
| 318 | CloneConstraint::filterGroups(CloneGroups, |
| 319 | [this](const CloneDetector::CloneGroup &A) { |
| 320 | return A.size() < MinGroupSize; |
| 321 | }); |
| 322 | } |
| 323 | }; |
| 324 | |
| 325 | /// Ensures that no clone group fully contains another clone group. |
| 326 | struct OnlyLargestCloneConstraint { |
| 327 | void constrain(std::vector<CloneDetector::CloneGroup> &Result); |
| 328 | }; |
| 329 | |
| 330 | struct FilenamePatternConstraint { |
| 331 | StringRef IgnoredFilesPattern; |
| 332 | std::shared_ptr<llvm::Regex> IgnoredFilesRegex; |
| 333 | |
| 334 | FilenamePatternConstraint(StringRef IgnoredFilesPattern) |
| 335 | : IgnoredFilesPattern(IgnoredFilesPattern) { |
| 336 | IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + |
| 337 | IgnoredFilesPattern.str() + "$)"); |
| 338 | } |
| 339 | |
| 340 | bool isAutoGenerated(const CloneDetector::CloneGroup &Group); |
| 341 | |
| 342 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
| 343 | CloneConstraint::filterGroups( |
| 344 | CloneGroups, [this](const CloneDetector::CloneGroup &Group) { |
| 345 | return isAutoGenerated(Group); |
| 346 | }); |
| 347 | } |
| 348 | }; |
| 349 | |
| 350 | /// Analyzes the pattern of the referenced variables in a statement. |
| 351 | class VariablePattern { |
| 352 | |
| 353 | /// Describes an occurrence of a variable reference in a statement. |
| 354 | struct VariableOccurence { |
| 355 | /// The index of the associated VarDecl in the Variables vector. |
| 356 | size_t KindID; |
| 357 | /// The statement in the code where the variable was referenced. |
| 358 | const Stmt *Mention; |
| 359 | |
| 360 | VariableOccurence(size_t KindID, const Stmt *Mention) |
| 361 | : KindID(KindID), Mention(Mention) {} |
| 362 | }; |
| 363 | |
| 364 | /// All occurrences of referenced variables in the order of appearance. |
| 365 | std::vector<VariableOccurence> Occurences; |
| 366 | /// List of referenced variables in the order of appearance. |
| 367 | /// Every item in this list is unique. |
| 368 | std::vector<const VarDecl *> Variables; |
| 369 | |
| 370 | /// Adds a new variable referenced to this pattern. |
| 371 | /// \param VarDecl The declaration of the variable that is referenced. |
| 372 | /// \param Mention The SourceRange where this variable is referenced. |
| 373 | void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); |
| 374 | |
| 375 | /// Adds each referenced variable from the given statement. |
| 376 | void addVariables(const Stmt *S); |
| 377 | |
| 378 | public: |
| 379 | /// Creates an VariablePattern object with information about the given |
| 380 | /// StmtSequence. |
| 381 | VariablePattern(const StmtSequence &Sequence) { |
| 382 | for (const Stmt *S : Sequence) |
| 383 | addVariables(S); |
| 384 | } |
| 385 | |
| 386 | /// Describes two clones that reference their variables in a different pattern |
| 387 | /// which could indicate a programming error. |
| 388 | struct SuspiciousClonePair { |
| 389 | /// Utility class holding the relevant information about a single |
| 390 | /// clone in this pair. |
| 391 | struct SuspiciousCloneInfo { |
| 392 | /// The variable which referencing in this clone was against the pattern. |
| 393 | const VarDecl *Variable; |
| 394 | /// Where the variable was referenced. |
| 395 | const Stmt *Mention; |
| 396 | /// The variable that should have been referenced to follow the pattern. |
| 397 | /// If Suggestion is a nullptr then it's not possible to fix the pattern |
| 398 | /// by referencing a different variable in this clone. |
| 399 | const VarDecl *Suggestion; |
| 400 | SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, |
| 401 | const VarDecl *Suggestion) |
| 402 | : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} |
| 403 | SuspiciousCloneInfo() {} |
| 404 | }; |
| 405 | /// The first clone in the pair which always has a suggested variable. |
| 406 | SuspiciousCloneInfo FirstCloneInfo; |
| 407 | /// This other clone in the pair which can have a suggested variable. |
| 408 | SuspiciousCloneInfo SecondCloneInfo; |
| 409 | }; |
| 410 | |
| 411 | /// Counts the differences between this pattern and the given one. |
| 412 | /// \param Other The given VariablePattern to compare with. |
| 413 | /// \param FirstMismatch Output parameter that will be filled with information |
| 414 | /// about the first difference between the two patterns. This parameter |
| 415 | /// can be a nullptr, in which case it will be ignored. |
| 416 | /// \return Returns the number of differences between the pattern this object |
| 417 | /// is following and the given VariablePattern. |
| 418 | /// |
| 419 | /// For example, the following statements all have the same pattern and this |
| 420 | /// function would return zero: |
| 421 | /// |
| 422 | /// if (a < b) return a; return b; |
| 423 | /// if (x < y) return x; return y; |
| 424 | /// if (u2 < u1) return u2; return u1; |
| 425 | /// |
| 426 | /// But the following statement has a different pattern (note the changed |
| 427 | /// variables in the return statements) and would have two differences when |
| 428 | /// compared with one of the statements above. |
| 429 | /// |
| 430 | /// if (a < b) return b; return a; |
| 431 | /// |
| 432 | /// This function should only be called if the related statements of the given |
| 433 | /// pattern and the statements of this objects are clones of each other. |
| 434 | unsigned countPatternDifferences( |
| 435 | const VariablePattern &Other, |
| 436 | VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); |
| 437 | }; |
| 438 | |
| 439 | /// Ensures that all clones reference variables in the same pattern. |
| 440 | struct MatchingVariablePatternConstraint { |
| 441 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); |
| 442 | }; |
| 443 | |
| 444 | } // end namespace clang |
| 445 | |
| 446 | #endif // LLVM_CLANG_AST_CLONEDETECTION_H |
| 447 |