LLVM 20.0.0git
SuffixTree.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// A data structure for fast substring queries.
9//
10// Suffix trees represent the suffixes of their input strings in their leaves.
11// A suffix tree is a type of compressed trie structure where each node
12// represents an entire substring rather than a single character. Each leaf
13// of the tree is a suffix.
14//
15// A suffix tree can be seen as a type of state machine where each state is a
16// substring of the full string. The tree is structured so that, for a string
17// of length N, there are exactly N leaves in the tree. This structure allows
18// us to quickly find repeated substrings of the input string.
19//
20// In this implementation, a "string" is a vector of unsigned integers.
21// These integers may result from hashing some data type. A suffix tree can
22// contain 1 or many strings, which can then be queried as one large string.
23//
24// The suffix tree is implemented using Ukkonen's algorithm for linear-time
25// suffix tree construction. Ukkonen's algorithm is explained in more detail
26// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
27// paper is available at
28//
29// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
30//===----------------------------------------------------------------------===//
31
32#ifndef LLVM_SUPPORT_SUFFIXTREE_H
33#define LLVM_SUPPORT_SUFFIXTREE_H
34
35#include "llvm/ADT/ArrayRef.h"
38
39namespace llvm {
41public:
42 /// Each element is an integer representing an instruction in the module.
44
45 /// Whether to consider leaf descendants or only leaf children.
47
48 /// A repeated substring in the tree.
50 /// The length of the string.
51 unsigned Length;
52
53 /// The start indices of each occurrence.
55 };
56
57private:
58 /// Maintains internal nodes in the tree.
60 /// Maintains leaf nodes in the tree.
62
63 /// The root of the suffix tree.
64 ///
65 /// The root represents the empty string. It is maintained by the
66 /// \p NodeAllocator like every other node in the tree.
67 SuffixTreeInternalNode *Root = nullptr;
68
69 /// The end index of each leaf in the tree.
70 unsigned LeafEndIdx = SuffixTreeNode::EmptyIdx;
71
72 /// Helper struct which keeps track of the next insertion point in
73 /// Ukkonen's algorithm.
74 struct ActiveState {
75 /// The next node to insert at.
77
78 /// The index of the first character in the substring currently being added.
80
81 /// The length of the substring we have to add at the current step.
82 unsigned Len = 0;
83 };
84
85 /// The point the next insertion will take place at in the
86 /// construction algorithm.
87 ActiveState Active;
88
89 /// Allocate a leaf node and add it to the tree.
90 ///
91 /// \param Parent The parent of this node.
92 /// \param StartIdx The start index of this node's associated string.
93 /// \param Edge The label on the edge leaving \p Parent to this node.
94 ///
95 /// \returns A pointer to the allocated leaf node.
96 SuffixTreeNode *insertLeaf(SuffixTreeInternalNode &Parent, unsigned StartIdx,
97 unsigned Edge);
98
99 /// Allocate an internal node and add it to the tree.
100 ///
101 /// \param Parent The parent of this node. Only null when allocating the root.
102 /// \param StartIdx The start index of this node's associated string.
103 /// \param EndIdx The end index of this node's associated string.
104 /// \param Edge The label on the edge leaving \p Parent to this node.
105 ///
106 /// \returns A pointer to the allocated internal node.
107 SuffixTreeInternalNode *insertInternalNode(SuffixTreeInternalNode *Parent,
108 unsigned StartIdx, unsigned EndIdx,
109 unsigned Edge);
110
111 /// Allocate the root node and add it to the tree.
112 ///
113 /// \returns A pointer to the root.
114 SuffixTreeInternalNode *insertRoot();
115
116 /// Set the suffix indices of the leaves to the start indices of their
117 /// respective suffixes.
118 void setSuffixIndices();
119
120 /// Construct the suffix tree for the prefix of the input ending at
121 /// \p EndIdx.
122 ///
123 /// Used to construct the full suffix tree iteratively. At the end of each
124 /// step, the constructed suffix tree is either a valid suffix tree, or a
125 /// suffix tree with implicit suffixes. At the end of the final step, the
126 /// suffix tree is a valid tree.
127 ///
128 /// \param EndIdx The end index of the current prefix in the main string.
129 /// \param SuffixesToAdd The number of suffixes that must be added
130 /// to complete the suffix tree at the current phase.
131 ///
132 /// \returns The number of suffixes that have not been added at the end of
133 /// this step.
134 unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
135
136 /// This vector contains all leaf nodes of this suffix tree. These leaf nodes
137 /// are identified using post-order depth-first traversal, so that the order
138 /// of these leaf nodes in the vector matches the order of the leaves in the
139 /// tree from left to right if one were to draw the tree on paper.
140 std::vector<SuffixTreeLeafNode *> LeafNodes;
141
142 /// Perform a post-order depth-first traversal of the tree and perform two
143 /// tasks during the traversal. The first is to populate LeafNodes, adding
144 /// nodes in order of the traversal. The second is to keep track of the leaf
145 /// descendants of every internal node by assigning values to LeftLeafIndex
146 /// and RightLefIndex fields of SuffixTreeNode for all internal nodes.
147 void setLeafNodes();
148
149public:
150 /// Construct a suffix tree from a sequence of unsigned integers.
151 ///
152 /// \param Str The string to construct the suffix tree for.
153 /// \param OutlinerLeafDescendants Whether to consider leaf descendants or
154 /// only leaf children (used by Machine Outliner).
156 bool OutlinerLeafDescendants = false);
157
158 /// Iterator for finding all repeated substrings in the suffix tree.
160 private:
161 /// The current node we're visiting.
162 SuffixTreeNode *N = nullptr;
163
164 /// The repeated substring associated with this node.
166
167 /// The nodes left to visit.
168 SmallVector<SuffixTreeInternalNode *> InternalNodesToVisit;
169
170 /// The minimum length of a repeated substring to find.
171 /// Since we're outlining, we want at least two instructions in the range.
172 /// FIXME: This may not be true for targets like X86 which support many
173 /// instruction lengths.
174 const unsigned MinLength = 2;
175
176 /// Vector of leaf nodes of the suffix tree.
177 const std::vector<SuffixTreeLeafNode *> &LeafNodes;
178
179 /// Whether to consider leaf descendants or only leaf children.
180 bool OutlinerLeafDescendants = !LeafNodes.empty();
181
182 /// Move the iterator to the next repeated substring.
183 void advance();
184
185 public:
186 /// Return the current repeated substring.
187 RepeatedSubstring &operator*() { return RS; }
188
190 advance();
191 return *this;
192 }
193
196 advance();
197 return It;
198 }
199
201 return N == Other.N;
202 }
204 return !(*this == Other);
205 }
206
209 const std::vector<SuffixTreeLeafNode *> &LeafNodes = {})
210 : N(N), LeafNodes(LeafNodes) {
211 // Do we have a non-null node?
212 if (!N)
213 return;
214 // Yes. At the first step, we need to visit all of N's children.
215 // Note: This means that we visit N last.
216 InternalNodesToVisit.push_back(N);
217 advance();
218 }
219 };
220
222 iterator begin() { return iterator(Root, LeafNodes); }
223 iterator end() { return iterator(nullptr); }
224};
225
226} // namespace llvm
227
228#endif // LLVM_SUPPORT_SUFFIXTREE_H
This file defines the BumpPtrAllocator interface.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define I(x, y, z)
Definition: MD5.cpp:58
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
A BumpPtrAllocator that allows only elements of a specific type to be allocated.
Definition: Allocator.h:389
iterator begin()
Definition: SuffixTree.h:222
iterator end()
Definition: SuffixTree.h:223
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:43
RepeatedSubstringIterator iterator
Definition: SuffixTree.h:221
bool OutlinerLeafDescendants
Whether to consider leaf descendants or only leaf children.
Definition: SuffixTree.h:46
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Other
Any other memory.
#define N
A node in a suffix tree which represents a substring or suffix.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Iterator for finding all repeated substrings in the suffix tree.
Definition: SuffixTree.h:159
RepeatedSubstringIterator operator++(int I)
Definition: SuffixTree.h:194
RepeatedSubstringIterator(SuffixTreeInternalNode *N, const std::vector< SuffixTreeLeafNode * > &LeafNodes={})
Definition: SuffixTree.h:207
bool operator!=(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:203
RepeatedSubstring & operator*()
Return the current repeated substring.
Definition: SuffixTree.h:187
bool operator==(const RepeatedSubstringIterator &Other) const
Definition: SuffixTree.h:200
RepeatedSubstringIterator & operator++()
Definition: SuffixTree.h:189
A repeated substring in the tree.
Definition: SuffixTree.h:49
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:54
unsigned Length
The length of the string.
Definition: SuffixTree.h:51