clang  7.0.0
CloneDetection.h
Go to the documentation of this file.
1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// This file defines classes for searching and analyzing source code clones.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
17 
18 #include "clang/AST/StmtVisitor.h"
19 #include "llvm/Support/Regex.h"
20 #include <vector>
21 
22 namespace clang {
23 
24 class Stmt;
25 class Decl;
26 class VarDecl;
27 class ASTContext;
28 class CompoundStmt;
29 
30 /// Identifies a list of statements.
31 ///
32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
33 /// child statements inside a CompoundStmt or no statements at all.
34 class StmtSequence {
35  /// If this object identifies a sequence of statements inside a CompoundStmt,
36  /// S points to this CompoundStmt. If this object only identifies a single
37  /// Stmt, then S is a pointer to this Stmt.
38  const Stmt *S;
39 
40  /// The declaration that contains the statements.
41  const Decl *D;
42 
43  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
44  /// instance is representing the CompoundStmt children inside the array
45  /// [StartIndex, EndIndex).
46  unsigned StartIndex;
47  unsigned EndIndex;
48 
49 public:
50  /// Constructs a StmtSequence holding multiple statements.
51  ///
52  /// The resulting StmtSequence identifies a continuous sequence of statements
53  /// in the body of the given CompoundStmt. Which statements of the body should
54  /// be identified needs to be specified by providing a start and end index
55  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
56  ///
57  /// \param Stmt A CompoundStmt that contains all statements in its body.
58  /// \param D The Decl containing this Stmt.
59  /// \param StartIndex The inclusive start index in the children array of
60  /// \p Stmt
61  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
62  StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
63  unsigned EndIndex);
64 
65  /// Constructs a StmtSequence holding a single statement.
66  ///
67  /// \param Stmt An arbitrary Stmt.
68  /// \param D The Decl containing this Stmt.
69  StmtSequence(const Stmt *Stmt, const Decl *D);
70 
71  /// Constructs an empty StmtSequence.
72  StmtSequence();
73 
74  typedef const Stmt *const *iterator;
75 
76  /// Returns an iterator pointing to the first statement in this sequence.
77  iterator begin() const;
78 
79  /// Returns an iterator pointing behind the last statement in this sequence.
80  iterator end() const;
81 
82  /// Returns the first statement in this sequence.
83  ///
84  /// This method should only be called on a non-empty StmtSequence object.
85  const Stmt *front() const {
86  assert(!empty());
87  return begin()[0];
88  }
89 
90  /// Returns the last statement in this sequence.
91  ///
92  /// This method should only be called on a non-empty StmtSequence object.
93  const Stmt *back() const {
94  assert(!empty());
95  return begin()[size() - 1];
96  }
97 
98  /// Returns the number of statements this object holds.
99  unsigned size() const {
100  if (holdsSequence())
101  return EndIndex - StartIndex;
102  if (S == nullptr)
103  return 0;
104  return 1;
105  }
106 
107  /// Returns true if and only if this StmtSequence contains no statements.
108  bool empty() const { return size() == 0; }
109 
110  /// Returns the related ASTContext for the stored Stmts.
111  ASTContext &getASTContext() const;
112 
113  /// Returns the declaration that contains the stored Stmts.
114  const Decl *getContainingDecl() const {
115  assert(D);
116  return D;
117  }
118 
119  /// Returns true if this objects holds a list of statements.
120  bool holdsSequence() const { return EndIndex != 0; }
121 
122  /// Returns the start sourcelocation of the first statement in this sequence.
123  ///
124  /// This method should only be called on a non-empty StmtSequence object.
125  SourceLocation getStartLoc() const LLVM_READONLY { return getBeginLoc(); }
126  SourceLocation getBeginLoc() const;
127 
128  /// Returns the end sourcelocation of the last statement in this sequence.
129  ///
130  /// This method should only be called on a non-empty StmtSequence object.
131  SourceLocation getEndLoc() const;
132 
133  /// Returns the source range of the whole sequence - from the beginning
134  /// of the first statement to the end of the last statement.
135  SourceRange getSourceRange() const;
136 
137  bool operator==(const StmtSequence &Other) const {
138  return std::tie(S, StartIndex, EndIndex) ==
139  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
140  }
141 
142  bool operator!=(const StmtSequence &Other) const {
143  return std::tie(S, StartIndex, EndIndex) !=
144  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
145  }
146 
147  /// Returns true if and only if this sequence covers a source range that
148  /// contains the source range of the given sequence \p Other.
149  ///
150  /// This method should only be called on a non-empty StmtSequence object
151  /// and passed a non-empty StmtSequence object.
152  bool contains(const StmtSequence &Other) const;
153 };
154 
155 /// Searches for similar subtrees in the AST.
156 ///
157 /// First, this class needs several declarations with statement bodies which
158 /// can be passed via analyzeCodeBody. Afterwards all statements can be
159 /// searched for clones by calling findClones with a given list of constraints
160 /// that should specify the wanted properties of the clones.
161 ///
162 /// The result of findClones can be further constrained with the constrainClones
163 /// method.
164 ///
165 /// This class only searches for clones in executable source code
166 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
167 /// are not supported.
169 
170 public:
171  /// A collection of StmtSequences that share an arbitrary property.
173 
174  /// Generates and stores search data for all statements in the body of
175  /// the given Decl.
176  void analyzeCodeBody(const Decl *D);
177 
178  /// Constrains the given list of clone groups with the given constraint.
179  ///
180  /// The constraint is expected to have a method with the signature
181  /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
182  /// as this is the interface that the CloneDetector uses for applying the
183  /// constraint. The constraint is supposed to directly modify the passed list
184  /// so that all clones in the list fulfill the specific property this
185  /// constraint ensures.
186  template <typename T>
187  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
188  C.constrain(CloneGroups);
189  }
190 
191  /// Constrains the given list of clone groups with the given list of
192  /// constraints.
193  ///
194  /// The constraints are applied in sequence in the order in which they are
195  /// passed to this function.
196  template <typename T1, typename... Ts>
197  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
198  Ts... ConstraintList) {
199  constrainClones(CloneGroups, C);
200  constrainClones(CloneGroups, ConstraintList...);
201  }
202 
203  /// Searches for clones in all previously passed statements.
204  /// \param Result Output parameter to which all created clone groups are
205  /// added.
206  /// \param ConstraintList The constraints that should be applied to the
207  // result.
208  template <typename... Ts>
209  void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
210  // The initial assumption is that there is only one clone group and every
211  // statement is a clone of the others. This clone group will then be
212  // split up with the help of the constraints.
213  CloneGroup AllClones;
214  AllClones.reserve(Sequences.size());
215  for (const auto &C : Sequences) {
216  AllClones.push_back(C);
217  }
218 
219  Result.push_back(AllClones);
220 
221  constrainClones(Result, ConstraintList...);
222  }
223 
224 private:
225  CloneGroup Sequences;
226 };
227 
228 /// This class is a utility class that contains utility functions for building
229 /// custom constraints.
231 public:
232  /// Removes all groups by using a filter function.
233  /// \param CloneGroups The list of CloneGroups that is supposed to be
234  /// filtered.
235  /// \param Filter The filter function that should return true for all groups
236  /// that should be removed from the list.
237  static void filterGroups(
238  std::vector<CloneDetector::CloneGroup> &CloneGroups,
239  llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
240  CloneGroups.erase(
241  std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
242  CloneGroups.end());
243  }
244 
245  /// Splits the given CloneGroups until the given Compare function returns true
246  /// for all clones in a single group.
247  /// \param CloneGroups A list of CloneGroups that should be modified.
248  /// \param Compare The comparison function that all clones are supposed to
249  /// pass. Should return true if and only if two clones belong
250  /// to the same CloneGroup.
251  static void splitCloneGroups(
252  std::vector<CloneDetector::CloneGroup> &CloneGroups,
253  llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
254  Compare);
255 };
256 
257 /// This constraint moves clones into clone groups of type II via hashing.
258 ///
259 /// Clones with different hash values are moved into separate clone groups.
260 /// Collisions are possible, and this constraint does nothing to address this
261 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
262 /// constraint chain, not necessarily immediately, to eliminate hash collisions
263 /// through a more detailed analysis.
265 public:
266  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
267 };
268 
269 /// This constraint moves clones into clone groups of type II by comparing them.
270 ///
271 /// Clones that aren't type II clones are moved into separate clone groups.
272 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
273 /// group are guaranteed to be be type II clones of each other, but it is too
274 /// slow to efficiently handle large amounts of clones.
276 public:
277  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
278 };
279 
280 /// Ensures that every clone has at least the given complexity.
281 ///
282 /// Complexity is here defined as the total amount of children of a statement.
283 /// This constraint assumes the first statement in the group is representative
284 /// for all other statements in the group in terms of complexity.
286  unsigned MinComplexity;
287 
288 public:
289  MinComplexityConstraint(unsigned MinComplexity)
290  : MinComplexity(MinComplexity) {}
291 
292  /// Calculates the complexity of the given StmtSequence.
293  /// \param Limit The limit of complexity we probe for. After reaching
294  /// this limit during calculation, this method is exiting
295  /// early to improve performance and returns this limit.
296  size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
297  const std::string &ParentMacroStack = "");
298 
299  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
301  CloneGroups, [this](const CloneDetector::CloneGroup &A) {
302  if (!A.empty())
303  return calculateStmtComplexity(A.front(), MinComplexity) <
304  MinComplexity;
305  else
306  return false;
307  });
308  }
309 };
310 
311 /// Ensures that all clone groups contain at least the given amount of clones.
313  unsigned MinGroupSize;
314 
315 public:
316  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
317  : MinGroupSize(MinGroupSize) {}
318 
319  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
320  CloneConstraint::filterGroups(CloneGroups,
321  [this](const CloneDetector::CloneGroup &A) {
322  return A.size() < MinGroupSize;
323  });
324  }
325 };
326 
327 /// Ensures that no clone group fully contains another clone group.
329  void constrain(std::vector<CloneDetector::CloneGroup> &Result);
330 };
331 
334  std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
335 
336  FilenamePatternConstraint(StringRef IgnoredFilesPattern)
337  : IgnoredFilesPattern(IgnoredFilesPattern) {
338  IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
339  IgnoredFilesPattern.str() + "$)");
340  }
341 
342  bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
343 
344  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
346  CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
347  return isAutoGenerated(Group);
348  });
349  }
350 };
351 
352 /// Analyzes the pattern of the referenced variables in a statement.
354 
355  /// Describes an occurrence of a variable reference in a statement.
356  struct VariableOccurence {
357  /// The index of the associated VarDecl in the Variables vector.
358  size_t KindID;
359  /// The statement in the code where the variable was referenced.
360  const Stmt *Mention;
361 
362  VariableOccurence(size_t KindID, const Stmt *Mention)
363  : KindID(KindID), Mention(Mention) {}
364  };
365 
366  /// All occurrences of referenced variables in the order of appearance.
367  std::vector<VariableOccurence> Occurences;
368  /// List of referenced variables in the order of appearance.
369  /// Every item in this list is unique.
370  std::vector<const VarDecl *> Variables;
371 
372  /// Adds a new variable referenced to this pattern.
373  /// \param VarDecl The declaration of the variable that is referenced.
374  /// \param Mention The SourceRange where this variable is referenced.
375  void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
376 
377  /// Adds each referenced variable from the given statement.
378  void addVariables(const Stmt *S);
379 
380 public:
381  /// Creates an VariablePattern object with information about the given
382  /// StmtSequence.
383  VariablePattern(const StmtSequence &Sequence) {
384  for (const Stmt *S : Sequence)
385  addVariables(S);
386  }
387 
388  /// Describes two clones that reference their variables in a different pattern
389  /// which could indicate a programming error.
391  /// Utility class holding the relevant information about a single
392  /// clone in this pair.
394  /// The variable which referencing in this clone was against the pattern.
395  const VarDecl *Variable;
396  /// Where the variable was referenced.
397  const Stmt *Mention;
398  /// The variable that should have been referenced to follow the pattern.
399  /// If Suggestion is a nullptr then it's not possible to fix the pattern
400  /// by referencing a different variable in this clone.
401  const VarDecl *Suggestion;
402  SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
403  const VarDecl *Suggestion)
404  : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
406  };
407  /// The first clone in the pair which always has a suggested variable.
409  /// This other clone in the pair which can have a suggested variable.
411  };
412 
413  /// Counts the differences between this pattern and the given one.
414  /// \param Other The given VariablePattern to compare with.
415  /// \param FirstMismatch Output parameter that will be filled with information
416  /// about the first difference between the two patterns. This parameter
417  /// can be a nullptr, in which case it will be ignored.
418  /// \return Returns the number of differences between the pattern this object
419  /// is following and the given VariablePattern.
420  ///
421  /// For example, the following statements all have the same pattern and this
422  /// function would return zero:
423  ///
424  /// if (a < b) return a; return b;
425  /// if (x < y) return x; return y;
426  /// if (u2 < u1) return u2; return u1;
427  ///
428  /// But the following statement has a different pattern (note the changed
429  /// variables in the return statements) and would have two differences when
430  /// compared with one of the statements above.
431  ///
432  /// if (a < b) return b; return a;
433  ///
434  /// This function should only be called if the related statements of the given
435  /// pattern and the statements of this objects are clones of each other.
436  unsigned countPatternDifferences(
437  const VariablePattern &Other,
438  VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
439 };
440 
441 /// Ensures that all clones reference variables in the same pattern.
443  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
444 };
445 
446 } // end namespace clang
447 
448 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
FilenamePatternConstraint(StringRef IgnoredFilesPattern)
SourceLocation getStartLoc() const LLVM_READONLY
Returns the start sourcelocation of the first statement in this sequence.
SourceLocation getEndLoc() const
Returns the end sourcelocation of the last statement in this sequence.
__SIZE_TYPE__ size_t
The unsigned integer type of the result of the sizeof operator.
Definition: opencl-c.h:60
Stmt - This represents one statement.
Definition: Stmt.h:66
ASTContext & getASTContext() const
Returns the related ASTContext for the stored Stmts.
SourceRange getSourceRange() const
Returns the source range of the whole sequence - from the beginning of the first statement to the end...
Analyzes the pattern of the referenced variables in a statement.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
const Stmt * front() const
Returns the first statement in this sequence.
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
bool operator!=(const StmtSequence &Other) const
bool operator==(const StmtSequence &Other) const
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Represents a variable declaration or definition.
Definition: Decl.h:814
StmtSequence()
Constructs an empty StmtSequence.
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, const VarDecl *Suggestion)
iterator begin() const
Returns an iterator pointing to the first statement in this sequence.
Identifies a list of statements.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:150
This class is a utility class that contains utility functions for building custom constraints...
Ensures that all clones reference variables in the same pattern.
Ensures that all clone groups contain at least the given amount of clones.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T C)
Constrains the given list of clone groups with the given constraint.
VariablePattern(const StmtSequence &Sequence)
Creates an VariablePattern object with information about the given StmtSequence.
void findClones(std::vector< CloneGroup > &Result, Ts... ConstraintList)
Searches for clones in all previously passed statements.
iterator end() const
Returns an iterator pointing behind the last statement in this sequence.
Searches for similar subtrees in the AST.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
bool empty() const
Returns true if and only if this StmtSequence contains no statements.
Describes two clones that reference their variables in a different pattern which could indicate a pro...
CompoundStmt - This represents a group of statements like { stmt stmt }.
Definition: Stmt.h:616
llvm::SmallVector< StmtSequence, 8 > CloneGroup
A collection of StmtSequences that share an arbitrary property.
This constraint moves clones into clone groups of type II via hashing.
SuspiciousCloneInfo SecondCloneInfo
This other clone in the pair which can have a suggested variable.
const Decl * getContainingDecl() const
Returns the declaration that contains the stored Stmts.
This constraint moves clones into clone groups of type II by comparing them.
Ensures that every clone has at least the given complexity.
const VarDecl * Suggestion
The variable that should have been referenced to follow the pattern.
The result type of a method or function.
std::shared_ptr< llvm::Regex > IgnoredFilesRegex
Utility class holding the relevant information about a single clone in this pair. ...
const Stmt * Mention
Where the variable was referenced.
unsigned size() const
Returns the number of statements this object holds.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T1 C, Ts... ConstraintList)
Constrains the given list of clone groups with the given list of constraints.
MinComplexityConstraint(unsigned MinComplexity)
Encodes a location in the source.
const Stmt *const * iterator
bool holdsSequence() const
Returns true if this objects holds a list of statements.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Dataflow Directional Tag Classes.
const Stmt * back() const
Returns the last statement in this sequence.
SourceLocation getBeginLoc() const
MinGroupSizeConstraint(unsigned MinGroupSize=2)
bool contains(const StmtSequence &Other) const
Returns true if and only if this sequence covers a source range that contains the source range of the...
static void filterGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const CloneDetector::CloneGroup &)> Filter)
Removes all groups by using a filter function.
SuspiciousCloneInfo FirstCloneInfo
The first clone in the pair which always has a suggested variable.
A trivial tuple used to represent a source range.
const VarDecl * Variable
The variable which referencing in this clone was against the pattern.
Ensures that no clone group fully contains another clone group.