teemperor updated this revision to Diff 63851.
teemperor marked 18 inline comments as done.
teemperor added a comment.
- Checker is now in the alpha package and hidden.
- MinGroupComplexity is now a parameter for the checker.
- StmtData is now called CloneSignature.
- Replaced the std::map inside CloneDetector with a vector and a small cache in
the HashVisitor.
- Moved code from AST to Analysis.
- Fixed multiple other smaller problems pointed out in the review. (Thanks,
Vassil, Anna and Artem!)
http://reviews.llvm.org/D20795
Files:
include/clang/Analysis/CloneDetection.h
include/clang/StaticAnalyzer/Checkers/Checkers.td
lib/Analysis/CMakeLists.txt
lib/Analysis/CloneDetection.cpp
lib/StaticAnalyzer/Checkers/CMakeLists.txt
lib/StaticAnalyzer/Checkers/CloneChecker.cpp
test/Analysis/copypaste/test-min-max.cpp
test/Analysis/copypaste/test-sub-sequences.cpp
Index: test/Analysis/copypaste/test-sub-sequences.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-sub-sequences.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// We test if sub-sequences can match with normal sequences containing only
+// a single statement.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+ log2(a);
+ log(); // expected-warning{{Detected code clone.}}
+ if (a > b)
+ return a;
+ return b;
+}
+
+int maxClone(int a, int b) { // expected-note{{Related code clone is here.}}
+ log();
+ if (a > b)
+ return a;
+ return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+ return a + b;
+}
Index: test/Analysis/copypaste/test-min-max.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-min-max.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+ log();
+ if (a > b)
+ return a;
+ return b;
+}
+
+int maxClone(int x, int y) { // expected-note{{Related code clone is here.}}
+ log();
+ if (x > y)
+ return x;
+ return y;
+}
+
+// False positives below. These clones should not be reported.
+
+// FIXME: Detect different binary operator kinds.
+int min1(int a, int b) { // expected-note{{Related code clone is here.}}
+ log();
+ if (a < b)
+ return a;
+ return b;
+}
+
+// FIXME: Detect different variable patterns.
+int min2(int a, int b) { // expected-note{{Related code clone is here.}}
+ log();
+ if (b > a)
+ return a;
+ return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+ return a + b;
+}
Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- /dev/null
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,80 @@
+//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/Analysis/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker : public Checker<check::EndOfTranslationUnit> {
+
+public:
+ void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+ AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+ AnalysisManager &Mgr,
+ BugReporter &BR) const {
+
+ int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
+ "MinimumCloneComplexity", 10, this);
+
+ assert(MinComplexity >= 0);
+
+ SourceManager &SM = BR.getSourceManager();
+
+ CloneDetector CloneDetector;
+ CloneDetector.analyzeTranslationUnit(const_cast<TranslationUnitDecl *>(TU));
+
+ std::vector<CloneDetector::CloneGroup> CloneGroups;
+ CloneDetector.findClones(CloneGroups, MinComplexity);
+
+ DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+ unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+ "Detected code clone.");
+
+ unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+ "Related code clone is here.");
+
+ for (CloneDetector::CloneGroup &Group : CloneGroups) {
+ std::sort(Group.begin(), Group.end(), [&SM](const StmtSequence &LHS,
+ const StmtSequence &RHS) {
+ return SM.isBeforeInTranslationUnit(LHS.getStartLoc(),
+ RHS.getStartLoc()) &&
+ SM.isBeforeInTranslationUnit(LHS.getEndLoc(), RHS.getEndLoc());
+ });
+ DiagEngine.Report(Group.front().getStartLoc(), WarnID);
+ for (unsigned J = 1; J < Group.size(); ++J) {
+ DiagEngine.Report(Group[J].getStartLoc(), NoteID);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+ Mgr.registerChecker<CloneChecker>();
+}
Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt
===================================================================
--- lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -22,6 +22,7 @@
CheckerDocumentation.cpp
ChrootChecker.cpp
ClangCheckers.cpp
+ CloneChecker.cpp
DeadStoresChecker.cpp
DebugCheckers.cpp
DereferenceChecker.cpp
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,433 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+ unsigned StartIndex, unsigned EndIndex)
+ : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+ assert(Stmt && "Stmt must not be a nullptr");
+ assert(StartIndex < EndIndex && "Given array should not be empty");
+ assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(Stmt const *Stmt, ASTContext &Context)
+ : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+ : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+ // If both sequences reside in different translation units, they can never
+ // contain each other.
+ if (Context != Other.Context)
+ return false;
+
+ const SourceManager &SM = Context->getSourceManager();
+
+ // Otherwise check if the start and end locations of the current sequence
+ // surround the other sequence.
+ bool StartIsInBounds =
+ SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+ getStartLoc() == Other.getStartLoc();
+ if (!StartIsInBounds)
+ return false;
+
+ bool EndIsInBounds =
+ SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+ Other.getEndLoc() == getEndLoc();
+ return EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+ if (!holdsSequence()) {
+ return &S;
+ }
+ auto CS = cast<CompoundStmt>(S);
+ return CS->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+ if (!holdsSequence()) {
+ return &S + 1;
+ }
+ auto CS = cast<CompoundStmt>(S);
+ return CS->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+ return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// \brief A cache for temporarily storing a small amount of CloneSignatures.
+///
+/// This cache should be manually cleaned from CloneSignatures that are
+/// no longer needed by calling \p remove or \p removeChildren . Otherwise the
+/// cache has to allocate memory to store the increased amount of signatures.
+/// However, the lookup time for the most recently added CloneSignatures won't
+/// deteriorate even if the cache is not properly cleaned.
+class CloneSignatureCache {
+ llvm::SmallVector<CloneDetector::CloneSignature, 16> Cache;
+
+public:
+ /// \brief Adds the Signature into the cache.
+ void add(CloneDetector::CloneSignature const &Signature) {
+ Cache.push_back(Signature);
+ }
+
+ /// \brief Removes the CloneSignature that describes the given Sequence.
+ void remove(StmtSequence const &Sequence) {
+ Cache.erase(
+ std::remove_if(
+ Cache.begin(), Cache.end(),
+ [&Sequence](CloneDetector::CloneSignature const &Signature) {
+ return Signature.Sequence == Sequence;
+ }),
+ Cache.end());
+ }
+
+ /// \brief Removes the associated signature for each child of the given Stmt.
+ /// \param Parent The given Stmt.
+ /// \param Context The ASTContext that contains Parent.
+ void removeChildren(Stmt const *Parent, ASTContext &Context) {
+ for (Stmt const *Child : Parent->children()) {
+ StmtSequence ChildSequence(Child, Context);
+ remove(ChildSequence);
+ }
+ }
+
+ /// \brief Retrieves the CloneSignature that describes the given Sequence.
+ CloneDetector::CloneSignature get(StmtSequence const &S) const {
+ // This cache promises good lookup time for recently added CloneSignatures
+ // even if not properly cleaned. As the most recently added CloneSignature
+ // is at the end, we start searching from back of the cache to the front to
+ // keep that promise.
+ for (auto I = Cache.rbegin(); I != Cache.rend(); ++I) {
+ if (I->Sequence == S) {
+ return *I;
+ }
+ }
+ llvm_unreachable("Couldn't find CloneSignature for StmtSequence");
+ }
+};
+} // end anonymous namespace
+
+namespace {
+/// Traverses the AST and calculates hash values and complexity values for all
+/// statements. The results are stored in a CloneDetector object.
+///
+/// This calculation happens in linear time and each statement is only visited a
+/// fixed amount of times during this process. The hash value for a statement is
+/// first calculated from the data stored directly in the statement object.
+/// Afterwards, the hash values of the children are calculated into the
+/// computed hash value.
+class HashVisitor : public RecursiveASTVisitor<HashVisitor> {
+
+ /// \brief Defines a placeholder hash value for missing child statements.
+ ///
+ /// The value of this constant should be a integer that is unlikely to be
+ /// generated by normally hashing to prevent unintended hash value collisions.
+ static const unsigned NullChildHashValue = 313;
+
+ CloneDetector &CD;
+ ASTContext &Context;
+
+ /// \brief Stores the signatures of visited nodes with an unvisited parent.
+ ///
+ /// This cache is necessary as the signature of a node is partly computed
+ /// from the data of its children. It's the reponsibility of the child node to
+ /// insert its own signature into the cache and the parents responsibility to
+ /// remove the signatures of its children.
+ CloneSignatureCache SignatureCache;
+
+public:
+ explicit HashVisitor(CloneDetector &CD, ASTContext &Context)
+ : CD(CD), Context(Context) {}
+
+ /// \brief Overwritten function that this visitor will traverse in postorder.
+ ///
+ /// We need to traverse postorder over the AST for our algorithm
+ /// to work as each parent expects that its children were already processed.
+ bool shouldTraversePostOrder() const { return true; }
+
+ bool VisitStmt(Stmt *S);
+
+ // Some statements have no parent that could remove their CloneSignatures from
+ // the cache. We manually clean such orphaned clones in the Visit* methods
+ // below. It's not a critical error if a certain declaration isn't handled
+ // here and it only slighly affects the memory consumption during traversal.
+
+ bool VisitFunctionDecl(FunctionDecl *D) {
+ if (D->hasBody())
+ SignatureCache.removeChildren(D->getBody(), Context);
+ return true;
+ }
+ bool VisitVarDecl(VarDecl *D) {
+ if (D->hasInit())
+ SignatureCache.removeChildren(D->getInit(), Context);
+ return true;
+ }
+ bool VisitParmVarDecl(ParmVarDecl *D) {
+ if (D->hasDefaultArg())
+ SignatureCache.removeChildren(D->getDefaultArg(), Context);
+ return true;
+ }
+ bool VisitCXXCtorInitializer(CXXCtorInitializer *D) {
+ SignatureCache.removeChildren(D->getInit(), Context);
+ return true;
+ }
+ bool VisitFieldDecl(FieldDecl *D) {
+ if (D->hasInClassInitializer())
+ SignatureCache.removeChildren(D->getInClassInitializer(), Context);
+ return true;
+ }
+
+private:
+ void HandleChildHashes(llvm::FoldingSetNodeID &Hash, unsigned &Complexity,
+ Stmt::const_child_iterator Iterator, unsigned StartPos,
+ unsigned Length) {
+ auto StartIterator = Iterator;
+ for (unsigned i = 0; i < StartPos; ++i)
+ ++StartIterator;
+ auto EndIterator = StartIterator;
+ for (unsigned i = 0; i < Length; ++i)
+ ++EndIterator;
+ Stmt::const_child_range Children(StartIterator, EndIterator);
+ HandleChildHashes(Hash, Complexity, Children);
+ }
+
+ /// Iterates over the specified Stmt array and computes the hash values of all
+ /// statements into the given hash value. Also, the complexity of all
+ /// statements is added to the given complexity value.
+ void HandleChildHashes(llvm::FoldingSetNodeID &Hash, unsigned &Complexity,
+ Stmt::const_child_range Children) {
+ // Iterate over each child in the sub-sequence.
+ for (Stmt const *Child : Children) {
+ if (Child == nullptr) {
+ ++Complexity;
+ // We use an placeholder hash value for null children.
+ Hash.AddInteger(NullChildHashValue);
+ } else {
+ auto Signature = SignatureCache.get(StmtSequence(Child, Context));
+ Complexity += Signature.Complexity;
+ Hash.AddInteger(Signature.Hash);
+ }
+ }
+ }
+
+ /// \brief Creates and saves the CloneSignatures for all sub-sequences in the
+ /// given
+ /// CompoundStmt.
+ ///
+ /// A sub-sequence is a continuous sequence of statements inside the child
+ /// array of the CompoundStmt.
+ void SaveAllSubSequences(CompoundStmt const *CS);
+};
+} // end anonymous namespace
+
+bool HashVisitor::VisitStmt(Stmt *S) {
+ StmtSequence CurrentStmt(S, Context);
+ llvm::FoldingSetNodeID Hash;
+ unsigned Complexity = 1;
+
+ // The only relevant data for now is the class of the statement, so we
+ // calculate the hash class into the current hash value.
+ // TODO: Handle statement class specific data.
+ Hash.AddInteger(S->getStmtClass());
+
+ // Incorporate the hash values of all child Stmts into the current hash value.
+ HandleChildHashes(Hash, Complexity, S->children());
+
+ CloneDetector::CloneSignature Signature(CurrentStmt, Hash.ComputeHash(),
+ Complexity);
+ // Add the signature to the cache that the parent can use it.
+ SignatureCache.add(Signature);
+
+ // Save the signature for the current sequence in the CloneDetector object.
+ CD.add(Signature);
+
+ // If the currently hashed statement is a CompoundStmt, we also need to handle
+ // the sub-sequences inside the CompoundStmt.
+ if (auto CS = dyn_cast<CompoundStmt>(S))
+ SaveAllSubSequences(CS);
+
+ // We no longer need the signatures of the children, so we remove them
+ // from the cache.
+ SignatureCache.removeChildren(S, Context);
+ return true;
+}
+
+void HashVisitor::SaveAllSubSequences(CompoundStmt const *CS) {
+ // The length of the sub-sequence. We don't need to hash sequences with the
+ // length 1 as they are already handled when we hashed the children. We also
+ // don't need to hash the sub-sequence that contains the whole body as this
+ // would be equal to the hash of the CompoundStmt itself.
+ for (unsigned Length = 2; Length < CS->size(); ++Length) {
+ // The start index in the body of the CompoundStmt.
+ // We increase the position and rehash until the end of the sub-sequence
+ // reaches the end of the CompoundStmt body.
+ for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+ llvm::FoldingSetNodeID SubHash;
+ unsigned SubComplexity = 1;
+ StmtSequence Sequence(CS, Context, Pos, Pos + Length);
+
+ // We consider a sub-sequence to be CompoundStmt that only contains
+ // the children in the sub-sequence. This is necessary for matching
+ // sub-sequences with full CompoundStmts.
+ SubHash.AddInteger(CS->getStmtClass());
+
+ HandleChildHashes(SubHash, SubComplexity, CS->child_begin(), Pos, Length);
+
+ CloneDetector::CloneSignature SubData(Sequence, SubHash.ComputeHash(),
+ SubComplexity);
+ CD.add(SubData);
+ }
+ }
+}
+
+void CloneDetector::analyzeTranslationUnit(TranslationUnitDecl *TU) {
+ HashVisitor visitor(*this, TU->getASTContext());
+ visitor.TraverseDecl(TU);
+}
+
+void CloneDetector::add(const CloneSignature &Data) {
+ Signatures.push_back(Data);
+}
+
+namespace {
+/// \brief Returns true if and only if \p Stmt contains at least one other
+/// sequence in the \p Group.
+bool StmtContainsAnyInGroup(StmtSequence &Stmt,
+ CloneDetector::CloneGroup &Group) {
+ for (StmtSequence &GroupStmt : Group) {
+ if (Stmt.contains(GroupStmt))
+ return true;
+ }
+ return false;
+}
+
+/// \brief Returns true if and only if all sequences in \p OtherGroup are
+/// contained by a sequence in \p Group.
+bool GroupContains(CloneDetector::CloneGroup &Group,
+ CloneDetector::CloneGroup &OtherGroup) {
+ // We have less sequences in the current group than we have in the other,
+ // so we will never fulfill the requirement for returning true. This is only
+ // possible because we know that a sequence in Group can contain at most
+ // one sequence in OtherGroup.
+ if (Group.size() < OtherGroup.size())
+ return false;
+
+ for (StmtSequence &Stmt : Group) {
+ if (!StmtContainsAnyInGroup(Stmt, OtherGroup))
+ return false;
+ }
+ return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+ unsigned MinGroupComplexity) {
+ // For creating hash groups, we need to find CloneSignatures with the same
+ // hash value, so we first sort all CloneSignatures by their hash value. Then
+ // we search for sequences of CloneSignatures with the same hash value.
+ std::sort(Signatures.begin(), Signatures.end(),
+ [](const CloneSignature &A, const CloneSignature &B) {
+ return A.Hash < B.Hash;
+ });
+
+ // Check for each CloneSignature if its successor has the same hash value.
+ // We don't check the last CloneSignature as it has no successor.
+ for (unsigned i = 0; i < Signatures.size() - 1; ++i) {
+ CloneSignature const &Current = Signatures[i];
+ CloneSignature const &Next = Signatures[i + 1];
+
+ if (Current.Hash != Next.Hash)
+ continue;
+
+ // It's likely that we just found an sequence of CloneSignatures that
+ // represent a CloneGroup, so we create a new group and start checking and
+ // adding the CloneSignatures in this sequence.
+ CloneGroup Group;
+
+ for (; i < Signatures.size(); ++i) {
+ CloneSignature const &Signature = Signatures[i];
+
+ // A different hash value means we have reached the end of the sequence.
+ if (Current.Hash != Signature.Hash) {
+ // The current Signature could be the start of a new CloneGroup. So we
+ // decrement i so that we visit it again in the outer loop.
+ // Note: i can never be 0 at this point because we are just comparing
+ // the hash of the Current CloneSignature with itself in the 'if' above.
+ assert(i != 0);
+ --i;
+ break;
+ }
+
+ // Skip CloneSignatures that won't pass the complexity requirement.
+ if (Signature.Complexity < MinGroupComplexity)
+ continue;
+
+ Group.push_back(Signature.Sequence);
+ }
+
+ // There is a chance that we haven't found more than two fitting
+ // CloneSignature because not enough CloneSignatures passed the complexity
+ // requirement. As a CloneGroup with less than two members makes no sense,
+ // we ignore this CloneGroup and won't add it to the result.
+ if (Group.size() <= 1)
+ continue;
+
+ Result.push_back(Group);
+ }
+
+ std::vector<unsigned> IndexesToRemove;
+
+ // Compare every group in the result with the rest. If one groups contains
+ // another group, we only need to return the bigger group.
+ // Note: This doesn't scale well, so if possible avoid calling any heavy
+ // function from this loop to reduce the negative performance impact.
+ for (unsigned i = 0; i < Result.size(); ++i) {
+ for (unsigned j = 0; j < Result.size(); ++j) {
+ // Don't compare a group with itself.
+ if (i == j)
+ continue;
+
+ if (GroupContains(Result[j], Result[i])) {
+ IndexesToRemove.push_back(i);
+ break;
+ }
+ }
+ }
+
+ // We need to erase invalid results from the result vector with decreasing
+ // unique indexes. For this we sort all indexes in descending order and then
+ // remove any duplicates.
+ std::sort(IndexesToRemove.begin(), IndexesToRemove.end(),
+ std::greater<unsigned>());
+
+ auto UniqueEnd = std::unique(IndexesToRemove.begin(), IndexesToRemove.end());
+ IndexesToRemove.erase(UniqueEnd, IndexesToRemove.end());
+
+ for (unsigned i : IndexesToRemove) {
+ Result.erase(Result.begin() + i);
+ }
+}
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -9,6 +9,7 @@
CFGReachabilityAnalysis.cpp
CFGStmtMap.cpp
CallGraph.cpp
+ CloneDetection.cpp
CocoaConventions.cpp
Consumed.cpp
CodeInjector.cpp
Index: include/clang/StaticAnalyzer/Checkers/Checkers.td
===================================================================
--- include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -77,6 +77,8 @@
def LLVM : Package<"llvm">;
def Debug : Package<"debug">;
+def CloneDetectionAlpha : Package<"clone">, InPackage<Alpha>, Hidden;
+
//===----------------------------------------------------------------------===//
// Core Checkers.
//===----------------------------------------------------------------------===//
@@ -657,3 +659,17 @@
DescFile<"DebugCheckers.cpp">;
} // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetectionAlpha in {
+
+def CloneChecker : Checker<"CloneChecker">,
+ HelpText<"Reports similar pieces of code.">,
+ DescFile<"CloneChecker.cpp">;
+
+} // end "clone"
+
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- /dev/null
+++ include/clang/Analysis/CloneDetection.h
@@ -0,0 +1,216 @@
+//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// /file
+/// This file defines classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class ASTContext;
+class CompoundStmt;
+class TranslationUnitDecl;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+ /// If this object identifies a sequence of statements inside a CompoundStmt,
+ /// S points to this CompoundStmt. If this object only identifies a single
+ /// Stmt, then S is a pointer to this Stmt.
+ Stmt const *S;
+
+ /// The related ASTContext for S.
+ ASTContext *Context;
+
+ /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+ /// instance is representing the CompoundStmt children inside the array
+ /// [StartIndex, EndIndex).
+ unsigned StartIndex;
+ unsigned EndIndex;
+
+public:
+ /// \brief Constructs a StmtSequence holding multiple statements.
+ ///
+ /// The resulting StmtSequence identifies a continuous sequence of statements
+ /// in the body of the given CompoundStmt. Which statements of the body should
+ /// be identified needs to be specified by providing a start and end index
+ /// that describe a non-empty sub-array in the body of the given CompoundStmt.
+ ///
+ /// \param Stmt A CompoundStmt that contains all statements in its body.
+ /// \param Context The ASTContext for the given CompoundStmt.
+ /// \param StartIndex The inclusive start index in the children array of
+ /// \p Stmt
+ /// \param EndIndex The exclusive end index in the children array of \p Stmt.
+ StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+ unsigned StartIndex, unsigned EndIndex);
+
+ /// \brief Constructs a StmtSequence holding a single statement.
+ ///
+ /// \param Stmt An arbitrary Stmt.
+ /// \param Context The ASTContext for the given Stmt.
+ StmtSequence(Stmt const *Stmt, ASTContext &Context);
+
+ /// \brief Constructs an empty StmtSequence.
+ StmtSequence();
+
+ typedef Stmt const *const *iterator;
+
+ /// Returns an iterator pointing to the first statement in this sequence.
+ iterator begin() const;
+
+ /// Returns an iterator pointing behind the last statement in this sequence.
+ iterator end() const;
+
+ /// Returns the first statement in this sequence.
+ ///
+ /// This method should only be called on a non-empty StmtSequence object.
+ Stmt const *front() const {
+ assert(!empty());
+ return begin()[0];
+ }
+
+ /// Returns the last statement in this sequence.
+ ///
+ /// This method should only be called on a non-empty StmtSequence object.
+ Stmt const *back() const {
+ assert(!empty());
+ return begin()[size() - 1];
+ }
+
+ /// Returns the number of statements this object holds.
+ unsigned size() const {
+ if (holdsSequence())
+ return EndIndex - StartIndex;
+ if (S == nullptr)
+ return 0;
+ return 1;
+ }
+
+ /// Returns true if and only if this StmtSequence contains no statements.
+ bool empty() const { return size() == 0; }
+
+ /// Returns the related ASTContext for the stored Stmts.
+ ASTContext &getASTContext() const {
+ assert(Context);
+ return *Context;
+ }
+
+ /// Returns true if this objects holds a list of statements.
+ bool holdsSequence() const { return EndIndex != 0; }
+
+ /// Returns the start sourcelocation of the first statement in this sequence.
+ ///
+ /// This method should only be called on a non-empty StmtSequence object.
+ SourceLocation getStartLoc() const;
+
+ /// Returns the end sourcelocation of the last statement in this sequence.
+ ///
+ /// This method should only be called on a non-empty StmtSequence object.
+ SourceLocation getEndLoc() const;
+
+ bool operator==(const StmtSequence &Other) const {
+ return std::tie(S, StartIndex, EndIndex) ==
+ std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+ }
+
+ bool operator!=(const StmtSequence &Other) const {
+ return std::tie(S, StartIndex, EndIndex) !=
+ std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+ }
+
+ /// Returns true if and only if this sequence covers a source range that
+ /// contains the source range of the given sequence \p Other.
+ ///
+ /// This method should only be called on a non-empty StmtSequence object
+ /// and passed a non-empty StmtSequence object.
+ bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches for clones in source code.
+///
+/// First, this class needs a translation unit which is passed via
+/// \p analyzeTranslationUnit . It will then generate and store search data
+/// for all statements inside the given translation unit.
+/// Afterwards the generated data can be used to find code clones by calling
+/// \p findClones .
+///
+/// The generates search data is needed for efficiently finding
+/// clones. This is done by hashing all statements using a locality-sensitive
+/// hash function that generates identical hash values for similar statement
+/// trees.
+///
+/// This class only searches for clones in exectuable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+class CloneDetector {
+public:
+ /// Holds the data about a StmtSequence that is needed during the search for
+ /// code clones.
+ struct CloneSignature {
+ /// The StmtSequence that is described by this CloneSignature.
+ StmtSequence Sequence;
+
+ /// The generated hash value for the StmtSequence. Two CloneSignature
+ /// objects with the same hash value will probably contain two StmtSequence
+ /// objects that are clones of each other. However, false positives are
+ /// possible here due to unintended hash function collisions.
+ ///
+ /// Two CloneSignature objects with a different hash value will never
+ /// contain two StmtSequence objects that are clones of each other.
+ unsigned Hash;
+
+ /// \brief The complexity of the StmtSequence.
+ ///
+ /// This scalar value serves as a simple way of filtering clones that are
+ /// too small to be reported. A greater value indicates that the related
+ /// StmtSequence is probably more interesting to the user.
+ unsigned Complexity;
+ CloneSignature() : Hash(0), Complexity(0) {}
+
+ CloneSignature(const StmtSequence &S, unsigned Hash, unsigned Complexity)
+ : Sequence(S), Hash(Hash), Complexity(Complexity) {}
+ };
+
+ /// \brief Generates and stores search data for all statements in the given
+ /// translation unit.
+ void analyzeTranslationUnit(TranslationUnitDecl *TU);
+
+ /// \brief Stores the CloneSignature to allow future querying.
+ void add(const CloneSignature &Data);
+
+ typedef llvm::SmallVector<StmtSequence, 4> CloneGroup;
+
+ /// \brief Searches the provided statements for clones.
+ ///
+ /// \param Result Output parameter that is filled with a list of found
+ /// clone groups. Each group contains multiple StmtSequences
+ /// that were identified to be clones of each other.
+ /// \param MinGroupComplexity Only return clones which have at least this
+ /// complexity value.
+ void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity);
+
+private:
+ /// Stores all provided statements and their associated search data.
+ std::vector<CloneSignature> Signatures;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits