LLVM  13.0.0git
SuffixTree.h
Go to the documentation of this file.
1 //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the Suffix Tree class and Suffix Tree Node struct.
10 //
11 //===----------------------------------------------------------------------===//
12 #ifndef LLVM_SUPPORT_SUFFIXTREE_H
13 #define LLVM_SUPPORT_SUFFIXTREE_H
14 
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/Support/Allocator.h"
18 #include <vector>
19 
20 namespace llvm {
21 
22 /// Represents an undefined index in the suffix tree.
23 const unsigned EmptyIdx = -1;
24 
25 /// A node in a suffix tree which represents a substring or suffix.
26 ///
27 /// Each node has either no children or at least two children, with the root
28 /// being a exception in the empty tree.
29 ///
30 /// Children are represented as a map between unsigned integers and nodes. If
31 /// a node N has a child M on unsigned integer k, then the mapping represented
32 /// by N is a proper prefix of the mapping represented by M. Note that this,
33 /// although similar to a trie is somewhat different: each node stores a full
34 /// substring of the full mapping rather than a single character state.
35 ///
36 /// Each internal node contains a pointer to the internal node representing
37 /// the same string, but with the first character chopped off. This is stored
38 /// in \p Link. Each leaf node stores the start index of its respective
39 /// suffix in \p SuffixIdx.
41 
42  /// The children of this node.
43  ///
44  /// A child existing on an unsigned integer implies that from the mapping
45  /// represented by the current node, there is a way to reach another
46  /// mapping by tacking that character on the end of the current string.
48 
49  /// The start index of this node's substring in the main string.
50  unsigned StartIdx = EmptyIdx;
51 
52  /// The end index of this node's substring in the main string.
53  ///
54  /// Every leaf node must have its \p EndIdx incremented at the end of every
55  /// step in the construction algorithm. To avoid having to update O(N)
56  /// nodes individually at the end of every step, the end index is stored
57  /// as a pointer.
58  unsigned *EndIdx = nullptr;
59 
60  /// For leaves, the start index of the suffix represented by this node.
61  ///
62  /// For all other nodes, this is ignored.
63  unsigned SuffixIdx = EmptyIdx;
64 
65  /// For internal nodes, a pointer to the internal node representing
66  /// the same sequence with the first character chopped off.
67  ///
68  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
69  /// Ukkonen's algorithm does to achieve linear-time construction is
70  /// keep track of which node the next insert should be at. This makes each
71  /// insert O(1), and there are a total of O(N) inserts. The suffix link
72  /// helps with inserting children of internal nodes.
73  ///
74  /// Say we add a child to an internal node with associated mapping S. The
75  /// next insertion must be at the node representing S - its first character.
76  /// This is given by the way that we iteratively build the tree in Ukkonen's
77  /// algorithm. The main idea is to look at the suffixes of each prefix in the
78  /// string, starting with the longest suffix of the prefix, and ending with
79  /// the shortest. Therefore, if we keep pointers between such nodes, we can
80  /// move to the next insertion point in O(1) time. If we don't, then we'd
81  /// have to query from the root, which takes O(N) time. This would make the
82  /// construction algorithm O(N^2) rather than O(N).
83  SuffixTreeNode *Link = nullptr;
84 
85  /// The length of the string formed by concatenating the edge labels from the
86  /// root to this node.
87  unsigned ConcatLen = 0;
88 
89  /// Returns true if this node is a leaf.
90  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
91 
92  /// Returns true if this node is the root of its owning \p SuffixTree.
93  bool isRoot() const { return StartIdx == EmptyIdx; }
94 
95  /// Return the number of elements in the substring associated with this node.
96  size_t size() const {
97 
98  // Is it the root? If so, it's the empty string so return 0.
99  if (isRoot())
100  return 0;
101 
102  assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
103 
104  // Size = the number of elements in the string.
105  // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
106  return *EndIdx - StartIdx + 1;
107  }
108 
111 
113 };
114 
115 /// A data structure for fast substring queries.
116 ///
117 /// Suffix trees represent the suffixes of their input strings in their leaves.
118 /// A suffix tree is a type of compressed trie structure where each node
119 /// represents an entire substring rather than a single character. Each leaf
120 /// of the tree is a suffix.
121 ///
122 /// A suffix tree can be seen as a type of state machine where each state is a
123 /// substring of the full string. The tree is structured so that, for a string
124 /// of length N, there are exactly N leaves in the tree. This structure allows
125 /// us to quickly find repeated substrings of the input string.
126 ///
127 /// In this implementation, a "string" is a vector of unsigned integers.
128 /// These integers may result from hashing some data type. A suffix tree can
129 /// contain 1 or many strings, which can then be queried as one large string.
130 ///
131 /// The suffix tree is implemented using Ukkonen's algorithm for linear-time
132 /// suffix tree construction. Ukkonen's algorithm is explained in more detail
133 /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
134 /// paper is available at
135 ///
136 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
137 class SuffixTree {
138 public:
139  /// Each element is an integer representing an instruction in the module.
141 
142  /// A repeated substring in the tree.
144  /// The length of the string.
145  unsigned Length;
146 
147  /// The start indices of each occurrence.
148  std::vector<unsigned> StartIndices;
149  };
150 
151 private:
152  /// Maintains each node in the tree.
154 
155  /// The root of the suffix tree.
156  ///
157  /// The root represents the empty string. It is maintained by the
158  /// \p NodeAllocator like every other node in the tree.
159  SuffixTreeNode *Root = nullptr;
160 
161  /// Maintains the end indices of the internal nodes in the tree.
162  ///
163  /// Each internal node is guaranteed to never have its end index change
164  /// during the construction algorithm; however, leaves must be updated at
165  /// every step. Therefore, we need to store leaf end indices by reference
166  /// to avoid updating O(N) leaves at every step of construction. Thus,
167  /// every internal node must be allocated its own end index.
168  llvm::BumpPtrAllocator InternalEndIdxAllocator;
169 
170  /// The end index of each leaf in the tree.
171  unsigned LeafEndIdx = -1;
172 
173  /// Helper struct which keeps track of the next insertion point in
174  /// Ukkonen's algorithm.
175  struct ActiveState {
176  /// The next node to insert at.
177  SuffixTreeNode *Node = nullptr;
178 
179  /// The index of the first character in the substring currently being added.
180  unsigned Idx = EmptyIdx;
181 
182  /// The length of the substring we have to add at the current step.
183  unsigned Len = 0;
184  };
185 
186  /// The point the next insertion will take place at in the
187  /// construction algorithm.
188  ActiveState Active;
189 
190  /// Allocate a leaf node and add it to the tree.
191  ///
192  /// \param Parent The parent of this node.
193  /// \param StartIdx The start index of this node's associated string.
194  /// \param Edge The label on the edge leaving \p Parent to this node.
195  ///
196  /// \returns A pointer to the allocated leaf node.
197  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
198  unsigned Edge);
199 
200  /// Allocate an internal node and add it to the tree.
201  ///
202  /// \param Parent The parent of this node. Only null when allocating the root.
203  /// \param StartIdx The start index of this node's associated string.
204  /// \param EndIdx The end index of this node's associated string.
205  /// \param Edge The label on the edge leaving \p Parent to this node.
206  ///
207  /// \returns A pointer to the allocated internal node.
208  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
209  unsigned EndIdx, unsigned Edge);
210 
211  /// Set the suffix indices of the leaves to the start indices of their
212  /// respective suffixes.
213  void setSuffixIndices();
214 
215  /// Construct the suffix tree for the prefix of the input ending at
216  /// \p EndIdx.
217  ///
218  /// Used to construct the full suffix tree iteratively. At the end of each
219  /// step, the constructed suffix tree is either a valid suffix tree, or a
220  /// suffix tree with implicit suffixes. At the end of the final step, the
221  /// suffix tree is a valid tree.
222  ///
223  /// \param EndIdx The end index of the current prefix in the main string.
224  /// \param SuffixesToAdd The number of suffixes that must be added
225  /// to complete the suffix tree at the current phase.
226  ///
227  /// \returns The number of suffixes that have not been added at the end of
228  /// this step.
229  unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
230 
231 public:
232  /// Construct a suffix tree from a sequence of unsigned integers.
233  ///
234  /// \param Str The string to construct the suffix tree for.
235  SuffixTree(const std::vector<unsigned> &Str);
236 
237  /// Iterator for finding all repeated substrings in the suffix tree.
239  private:
240  /// The current node we're visiting.
241  SuffixTreeNode *N = nullptr;
242 
243  /// The repeated substring associated with this node.
245 
246  /// The nodes left to visit.
247  std::vector<SuffixTreeNode *> ToVisit;
248 
249  /// The minimum length of a repeated substring to find.
250  /// Since we're outlining, we want at least two instructions in the range.
251  /// FIXME: This may not be true for targets like X86 which support many
252  /// instruction lengths.
253  const unsigned MinLength = 2;
254 
255  /// Move the iterator to the next repeated substring.
256  void advance() {
257  // Clear the current state. If we're at the end of the range, then this
258  // is the state we want to be in.
259  RS = RepeatedSubstring();
260  N = nullptr;
261 
262  // Each leaf node represents a repeat of a string.
263  std::vector<SuffixTreeNode *> LeafChildren;
264 
265  // Continue visiting nodes until we find one which repeats more than once.
266  while (!ToVisit.empty()) {
267  SuffixTreeNode *Curr = ToVisit.back();
268  ToVisit.pop_back();
269  LeafChildren.clear();
270 
271  // Keep track of the length of the string associated with the node. If
272  // it's too short, we'll quit.
273  unsigned Length = Curr->ConcatLen;
274 
275  // Iterate over each child, saving internal nodes for visiting, and
276  // leaf nodes in LeafChildren. Internal nodes represent individual
277  // strings, which may repeat.
278  for (auto &ChildPair : Curr->Children) {
279  // Save all of this node's children for processing.
280  if (!ChildPair.second->isLeaf())
281  ToVisit.push_back(ChildPair.second);
282 
283  // It's not an internal node, so it must be a leaf. If we have a
284  // long enough string, then save the leaf children.
285  else if (Length >= MinLength)
286  LeafChildren.push_back(ChildPair.second);
287  }
288 
289  // The root never represents a repeated substring. If we're looking at
290  // that, then skip it.
291  if (Curr->isRoot())
292  continue;
293 
294  // Do we have any repeated substrings?
295  if (LeafChildren.size() >= 2) {
296  // Yes. Update the state to reflect this, and then bail out.
297  N = Curr;
298  RS.Length = Length;
299  for (SuffixTreeNode *Leaf : LeafChildren)
300  RS.StartIndices.push_back(Leaf->SuffixIdx);
301  break;
302  }
303  }
304 
305  // At this point, either NewRS is an empty RepeatedSubstring, or it was
306  // set in the above loop. Similarly, N is either nullptr, or the node
307  // associated with NewRS.
308  }
309 
310  public:
311  /// Return the current repeated substring.
312  RepeatedSubstring &operator*() { return RS; }
313 
315  advance();
316  return *this;
317  }
318 
320  RepeatedSubstringIterator It(*this);
321  advance();
322  return It;
323  }
324 
325  bool operator==(const RepeatedSubstringIterator &Other) const {
326  return N == Other.N;
327  }
328  bool operator!=(const RepeatedSubstringIterator &Other) const {
329  return !(*this == Other);
330  }
331 
333  // Do we have a non-null node?
334  if (N) {
335  // Yes. At the first step, we need to visit all of N's children.
336  // Note: This means that we visit N last.
337  ToVisit.push_back(N);
338  advance();
339  }
340  }
341  };
342 
344  iterator begin() { return iterator(Root); }
345  iterator end() { return iterator(nullptr); }
346 };
347 
348 } // namespace llvm
349 
350 #endif // LLVM_SUPPORT_SUFFIXTREE_H
llvm::SuffixTree::RepeatedSubstringIterator::operator++
RepeatedSubstringIterator operator++(int I)
Definition: SuffixTree.h:319
llvm::SuffixTree::RepeatedSubstring::Length
unsigned Length
The length of the string.
Definition: SuffixTree.h:145
llvm
This class represents lattice values for constants.
Definition: AllocatorList.h:23
llvm::SuffixTree::begin
iterator begin()
Definition: SuffixTree.h:344
Allocator.h
llvm::SuffixTree::RepeatedSubstring::StartIndices
std::vector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:148
llvm::SpecificBumpPtrAllocator
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:376
DenseMap.h
llvm::SuffixTree
A data structure for fast substring queries.
Definition: SuffixTree.h:137
llvm::SuffixTree::SuffixTree
SuffixTree(const std::vector< unsigned > &Str)
Construct a suffix tree from a sequence of unsigned integers.
Definition: SuffixTree.cpp:19
llvm::SuffixTree::RepeatedSubstringIterator::operator==
bool operator==(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:325
llvm::EmptyIdx
const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Definition: SuffixTree.h:23
llvm::SuffixTreeNode::EndIdx
unsigned * EndIdx
The end index of this node's substring in the main string.
Definition: SuffixTree.h:58
llvm::SuffixTree::RepeatedSubstringIterator
Iterator for finding all repeated substrings in the suffix tree.
Definition: SuffixTree.h:238
llvm::SuffixTree::RepeatedSubstringIterator::operator!=
bool operator!=(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:328
llvm::SuffixTree::RepeatedSubstringIterator::operator*
RepeatedSubstring & operator*()
Return the current repeated substring.
Definition: SuffixTree.h:312
llvm::SuffixTreeNode::SuffixTreeNode
SuffixTreeNode()
Definition: SuffixTree.h:112
llvm::SuffixTreeNode::SuffixTreeNode
SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
Definition: SuffixTree.h:109
llvm::BumpPtrAllocatorImpl
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:67
llvm::SuffixTree::RepeatedSubstring
A repeated substring in the tree.
Definition: SuffixTree.h:143
llvm::SuffixTree::iterator
RepeatedSubstringIterator iterator
Definition: SuffixTree.h:343
llvm::DenseMap
Definition: DenseMap.h:714
llvm::SuffixTreeNode::Link
SuffixTreeNode * Link
For internal nodes, a pointer to the internal node representing the same sequence with the first char...
Definition: SuffixTree.h:83
llvm::SuffixTree::Str
llvm::ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:140
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::SuffixTree::end
iterator end()
Definition: SuffixTree.h:345
ArrayRef.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::SuffixTreeNode::SuffixIdx
unsigned SuffixIdx
For leaves, the start index of the suffix represented by this node.
Definition: SuffixTree.h:63
llvm::SuffixTreeNode::Children
llvm::DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
Definition: SuffixTree.h:47
llvm::SuffixTreeNode
A node in a suffix tree which represents a substring or suffix.
Definition: SuffixTree.h:40
llvm::ArrayRef< unsigned >
llvm::SuffixTreeNode::isRoot
bool isRoot() const
Returns true if this node is the root of its owning SuffixTree.
Definition: SuffixTree.h:93
Node
Definition: ItaniumDemangle.h:114
llvm::SuffixTreeNode::StartIdx
unsigned StartIdx
The start index of this node's substring in the main string.
Definition: SuffixTree.h:50
llvm::SuffixTreeNode::isLeaf
bool isLeaf() const
Returns true if this node is a leaf.
Definition: SuffixTree.h:90
llvm::SuffixTreeNode::ConcatLen
unsigned ConcatLen
The length of the string formed by concatenating the edge labels from the root to this node.
Definition: SuffixTree.h:87
llvm::SuffixTreeNode::size
size_t size() const
Return the number of elements in the substring associated with this node.
Definition: SuffixTree.h:96
N
#define N
llvm::SuffixTree::RepeatedSubstringIterator::RepeatedSubstringIterator
RepeatedSubstringIterator(SuffixTreeNode *N)
Definition: SuffixTree.h:332
llvm::SuffixTree::RepeatedSubstringIterator::operator++
RepeatedSubstringIterator & operator++()
Definition: SuffixTree.h:314
Other
Optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1131