LLVM 22.0.0git
SuffixTree.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// A data structure for fast substring queries.
9//
10// Suffix trees represent the suffixes of their input strings in their leaves.
11// A suffix tree is a type of compressed trie structure where each node
12// represents an entire substring rather than a single character. Each leaf
13// of the tree is a suffix.
14//
15// A suffix tree can be seen as a type of state machine where each state is a
16// substring of the full string. The tree is structured so that, for a string
17// of length N, there are exactly N leaves in the tree. This structure allows
18// us to quickly find repeated substrings of the input string.
19//
20// In this implementation, a "string" is a vector of unsigned integers.
21// These integers may result from hashing some data type. A suffix tree can
22// contain 1 or many strings, which can then be queried as one large string.
23//
24// The suffix tree is implemented using Ukkonen's algorithm for linear-time
25// suffix tree construction. Ukkonen's algorithm is explained in more detail
26// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27// paper is available at
28//
29// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30//===----------------------------------------------------------------------===//
31
32#ifndef LLVM_SUPPORT_SUFFIXTREE_H
33#define LLVM_SUPPORT_SUFFIXTREE_H
34
35#include "llvm/ADT/ArrayRef.h"
39
40namespace llvm {
42public:
43 /// Each element is an integer representing an instruction in the module.
45
46 /// Whether to consider leaf descendants or only leaf children.
48
49 /// A repeated substring in the tree.
51 /// The length of the string.
52 unsigned Length;
53
54 /// The start indices of each occurrence.
56 };
57
58private:
59 /// Maintains internal nodes in the tree.
61 /// Maintains leaf nodes in the tree.
63
64 /// The root of the suffix tree.
65 ///
66 /// The root represents the empty string. It is maintained by the
67 /// \p NodeAllocator like every other node in the tree.
68 SuffixTreeInternalNode *Root = nullptr;
69
70 /// The end index of each leaf in the tree.
71 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
72
73 /// Helper struct which keeps track of the next insertion point in
74 /// Ukkonen's algorithm.
75 struct ActiveState {
76 /// The next node to insert at.
78
79 /// The index of the first character in the substring currently being added.
80 unsigned Idx = SuffixTreeNode::EmptyIdx;
81
82 /// The length of the substring we have to add at the current step.
83 unsigned Len = 0;
84 };
85
86 /// The point the next insertion will take place at in the
87 /// construction algorithm.
88 ActiveState Active;
89
90 /// Allocate a leaf node and add it to the tree.
91 ///
92 /// \param Parent The parent of this node.
93 /// \param StartIdx The start index of this node's associated string.
94 /// \param Edge The label on the edge leaving \p Parent to this node.
95 ///
96 /// \returns A pointer to the allocated leaf node.
97 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
98 unsigned Edge);
99
100 /// Allocate an internal node and add it to the tree.
101 ///
102 /// \param Parent The parent of this node. Only null when allocating the root.
103 /// \param StartIdx The start index of this node's associated string.
104 /// \param EndIdx The end index of this node's associated string.
105 /// \param Edge The label on the edge leaving \p Parent to this node.
106 ///
107 /// \returns A pointer to the allocated internal node.
108 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
109 unsigned StartIdx, unsigned EndIdx,
110 unsigned Edge);
111
112 /// Allocate the root node and add it to the tree.
113 ///
114 /// \returns A pointer to the root.
115 SuffixTreeInternalNode *insertRoot();
116
117 /// Set the suffix indices of the leaves to the start indices of their
118 /// respective suffixes.
119 void setSuffixIndices();
120
121 /// Construct the suffix tree for the prefix of the input ending at
122 /// \p EndIdx.
123 ///
124 /// Used to construct the full suffix tree iteratively. At the end of each
125 /// step, the constructed suffix tree is either a valid suffix tree, or a
126 /// suffix tree with implicit suffixes. At the end of the final step, the
127 /// suffix tree is a valid tree.
128 ///
129 /// \param EndIdx The end index of the current prefix in the main string.
130 /// \param SuffixesToAdd The number of suffixes that must be added
131 /// to complete the suffix tree at the current phase.
132 ///
133 /// \returns The number of suffixes that have not been added at the end of
134 /// this step.
135 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
136
137 /// This vector contains all leaf nodes of this suffix tree. These leaf nodes
138 /// are identified using post-order depth-first traversal, so that the order
139 /// of these leaf nodes in the vector matches the order of the leaves in the
140 /// tree from left to right if one were to draw the tree on paper.
141 std::vector<SuffixTreeLeafNode *> LeafNodes;
142
143 /// Perform a post-order depth-first traversal of the tree and perform two
144 /// tasks during the traversal. The first is to populate LeafNodes, adding
145 /// nodes in order of the traversal. The second is to keep track of the leaf
146 /// descendants of every internal node by assigning values to LeftLeafIndex
147 /// and RightLefIndex fields of SuffixTreeNode for all internal nodes.
148 void setLeafNodes();
149
150public:
151 /// Construct a suffix tree from a sequence of unsigned integers.
152 ///
153 /// \param Str The string to construct the suffix tree for.
154 /// \param OutlinerLeafDescendants Whether to consider leaf descendants or
155 /// only leaf children (used by Machine Outliner).
157 bool OutlinerLeafDescendants = false);
158
159 /// Iterator for finding all repeated substrings in the suffix tree.
161 private:
162 /// The current node we're visiting.
163 SuffixTreeNode *N = nullptr;
164
165 /// The repeated substring associated with this node.
167
168 /// The nodes left to visit.
169 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
170
171 /// The minimum length of a repeated substring to find.
172 /// Since we're outlining, we want at least two instructions in the range.
173 /// FIXME: This may not be true for targets like X86 which support many
174 /// instruction lengths.
175 const unsigned MinLength = 2;
176
177 /// Vector of leaf nodes of the suffix tree.
178 const std::vector<SuffixTreeLeafNode *> &LeafNodes;
179
180 /// Whether to consider leaf descendants or only leaf children.
181 bool OutlinerLeafDescendants = !LeafNodes.empty();
182
183 /// Move the iterator to the next repeated substring.
184 LLVM_ABI void advance();
185
186 public:
187 /// Return the current repeated substring.
188 RepeatedSubstring &operator*() { return RS; }
189
191 advance();
192 return *this;
193 }
194
197 advance();
198 return It;
199 }
200
202 return N == Other.N;
203 }
205 return !(*this == Other);
206 }
207
210 const std::vector<SuffixTreeLeafNode *> &LeafNodes = {})
211 : N(N), LeafNodes(LeafNodes) {
212 // Do we have a non-null node?
213 if (!N)
214 return;
215 // Yes. At the first step, we need to visit all of N's children.
216 // Note: This means that we visit N last.
217 InternalNodesToVisit.push_back(N);
218 advance();
219 }
220 };
221
223 iterator begin() { return iterator(Root, LeafNodes); }
224 iterator end() { return iterator(nullptr); }
225};
226
227} // namespace llvm
228
229#endif // LLVM_SUPPORT_SUFFIXTREE_H
This file defines the BumpPtrAllocator interface.
#define LLVM_ABI
Definition Compiler.h:213
#define I(x, y, z)
Definition MD5.cpp:58
static cl::opt< bool > OutlinerLeafDescendants("outliner-leaf-descendants", cl::init(true), cl::Hidden, cl::desc("Consider all leaf descendants of internal nodes of the suffix " "tree as candidates for outlining (if false, only leaf children " "are considered)"))
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition Allocator.h:390
LLVM_ABI SuffixTree(const ArrayRef< unsigned > &Str, bool OutlinerLeafDescendants=false)
Construct a suffix tree from a sequence of unsigned integers.
iterator begin()
Definition SuffixTree.h:223
iterator end()
Definition SuffixTree.h:224
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition SuffixTree.h:44
RepeatedSubstringIterator iterator
Definition SuffixTree.h:222
bool OutlinerLeafDescendants
Whether to consider leaf descendants or only leaf children.
Definition SuffixTree.h:47
This is an optimization pass for GlobalISel generic memory operations.
@ Other
Any other memory.
Definition ModRef.h:68
#define N
A node in a suffix tree which represents a substring or suffix.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Iterator for finding all repeated substrings in the suffix tree.
Definition SuffixTree.h:160
RepeatedSubstringIterator operator++(int I)
Definition SuffixTree.h:195
RepeatedSubstringIterator(SuffixTreeInternalNode *N, const std::vector< SuffixTreeLeafNode * > &LeafNodes={})
Definition SuffixTree.h:208
bool operator!=(const RepeatedSubstringIterator &Other) const
Definition SuffixTree.h:204
RepeatedSubstring & operator*()
Return the current repeated substring.
Definition SuffixTree.h:188
bool operator==(const RepeatedSubstringIterator &Other) const
Definition SuffixTree.h:201
RepeatedSubstringIterator & operator++()
Definition SuffixTree.h:190
A repeated substring in the tree.
Definition SuffixTree.h:50
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition SuffixTree.h:55
unsigned Length
The length of the string.
Definition SuffixTree.h:52