LLVM 22.0.0git
SuffixTree.cpp
Go to the documentation of this file.
1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
17
18using namespace llvm;
19
20/// \returns the number of elements in the substring associated with \p N.
22 assert(N && "Got a null node?");
23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24 if (Internal->isRoot())
25 return 0;
26 return N->getEndIdx() - N->getStartIdx() + 1;
27}
28
32 Root = insertRoot();
33 Active.Node = Root;
34
35 // Keep track of the number of suffixes we have to add of the current
36 // prefix.
37 unsigned SuffixesToAdd = 0;
38
39 // Construct the suffix tree iteratively on each prefix of the string.
40 // PfxEndIdx is the end index of the current prefix.
41 // End is one past the last element in the string.
42 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
43 SuffixesToAdd++;
44 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
45 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
46 }
47
48 // Set the suffix indices of each leaf.
49 assert(Root && "Root node can't be nullptr!");
50 setSuffixIndices();
51
52 // Collect all leaf nodes of the suffix tree. And for each internal node,
53 // record the range of leaf nodes that are descendants of it.
55 setLeafNodes();
56}
57
58SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
59 unsigned StartIdx, unsigned Edge) {
60 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
61 auto *N = new (LeafNodeAllocator.Allocate())
62 SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
63 Parent.Children[Edge] = N;
64 return N;
65}
66
68SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
69 unsigned StartIdx, unsigned EndIdx,
70 unsigned Edge) {
71 assert(StartIdx <= EndIdx && "String can't start after it ends!");
72 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
73 "Non-root internal nodes must have parents!");
74 auto *N = new (InternalNodeAllocator.Allocate())
75 SuffixTreeInternalNode(StartIdx, EndIdx, Root);
76 if (Parent)
77 Parent->Children[Edge] = N;
78 return N;
79}
80
81SuffixTreeInternalNode *SuffixTree::insertRoot() {
82 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
83 SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
84}
85
86void SuffixTree::setSuffixIndices() {
87 // List of nodes we need to visit along with the current length of the
88 // string.
90
91 // Current node being visited.
92 SuffixTreeNode *CurrNode = Root;
93
94 // Sum of the lengths of the nodes down the path to the current one.
95 unsigned CurrNodeLen = 0;
96 ToVisit.push_back({CurrNode, CurrNodeLen});
97 while (!ToVisit.empty()) {
98 std::tie(CurrNode, CurrNodeLen) = ToVisit.pop_back_val();
99 // Length of the current node from the root down to here.
100 CurrNode->setConcatLen(CurrNodeLen);
101 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
102 for (auto &ChildPair : InternalNode->Children) {
103 assert(ChildPair.second && "Node had a null child!");
104 ToVisit.push_back(
105 {ChildPair.second,
106 CurrNodeLen + numElementsInSubstring(ChildPair.second)});
107 }
108 // No children, so we are at the end of the string.
109 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
110 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
111 }
112}
113
114void SuffixTree::setLeafNodes() {
115 // A stack that keeps track of nodes to visit for post-order DFS traversal.
117 ToVisit.push_back(Root);
118
119 // This keeps track of the index of the next leaf node to be added to
120 // the LeafNodes vector of the suffix tree.
121 unsigned LeafCounter = 0;
122
123 // This keeps track of nodes whose children have been added to the stack.
124 // The value is a pair, representing a node's first and last children.
125 DenseMap<SuffixTreeInternalNode *,
126 std::pair<SuffixTreeNode *, SuffixTreeNode *>>
127 ChildrenMap;
128
129 // Traverse the tree in post-order.
130 while (!ToVisit.empty()) {
131 SuffixTreeNode *CurrNode = ToVisit.pop_back_val();
132 if (auto *CurrInternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode)) {
133 // The current node is an internal node.
134 auto I = ChildrenMap.find(CurrInternalNode);
135 if (I == ChildrenMap.end()) {
136 // This is the first time we visit this node.
137 // Its children have not been added to the stack yet.
138 // We add current node back, and add its children to the stack.
139 // We keep track of the first and last children of the current node.
140 auto J = CurrInternalNode->Children.begin();
141 if (J != CurrInternalNode->Children.end()) {
142 ToVisit.push_back(CurrNode);
143 SuffixTreeNode *FirstChild = J->second;
144 SuffixTreeNode *LastChild = nullptr;
145 for (; J != CurrInternalNode->Children.end(); ++J) {
146 LastChild = J->second;
147 ToVisit.push_back(LastChild);
148 }
149 ChildrenMap[CurrInternalNode] = {FirstChild, LastChild};
150 }
151 } else {
152 // This is the second time we visit this node.
153 // All of its children have already been processed.
154 // Now, we can set its LeftLeafIdx and RightLeafIdx;
155 auto [FirstChild, LastChild] = I->second;
156 // Get the first child to use its RightLeafIdx.
157 // The first child is the first one added to the stack, so it is
158 // the last one to be processed. Hence, the leaf descendants
159 // of the first child are assigned the largest index numbers.
160 CurrNode->setRightLeafIdx(FirstChild->getRightLeafIdx());
161 // Get the last child to use its LeftLeafIdx.
162 CurrNode->setLeftLeafIdx(LastChild->getLeftLeafIdx());
163 assert(CurrNode->getLeftLeafIdx() <= CurrNode->getRightLeafIdx() &&
164 "LeftLeafIdx should not be larger than RightLeafIdx");
165 }
166 } else {
167 // The current node is a leaf node.
168 // We can simply set its LeftLeafIdx and RightLeafIdx.
169 CurrNode->setLeftLeafIdx(LeafCounter);
170 CurrNode->setRightLeafIdx(LeafCounter);
171 ++LeafCounter;
172 auto *CurrLeafNode = cast<SuffixTreeLeafNode>(CurrNode);
173 LeafNodes.push_back(CurrLeafNode);
174 }
175 }
176}
177
178unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
179 SuffixTreeInternalNode *NeedsLink = nullptr;
180
181 while (SuffixesToAdd > 0) {
182
183 // Are we waiting to add anything other than just the last character?
184 if (Active.Len == 0) {
185 // If not, then say the active index is the end index.
186 Active.Idx = EndIdx;
187 }
188
189 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
190
191 // The first character in the current substring we're looking at.
192 unsigned FirstChar = Str[Active.Idx];
193
194 // Have we inserted anything starting with FirstChar at the current node?
195 if (auto It = Active.Node->Children.find(FirstChar);
196 It == Active.Node->Children.end()) {
197 // If not, then we can just insert a leaf and move to the next step.
198 insertLeaf(*Active.Node, EndIdx, FirstChar);
199
200 // The active node is an internal node, and we visited it, so it must
201 // need a link if it doesn't have one.
202 if (NeedsLink) {
203 NeedsLink->setLink(Active.Node);
204 NeedsLink = nullptr;
205 }
206 } else {
207 // There's a match with FirstChar, so look for the point in the tree to
208 // insert a new node.
209 SuffixTreeNode *NextNode = It->second;
210
211 unsigned SubstringLen = numElementsInSubstring(NextNode);
212
213 // Is the current suffix we're trying to insert longer than the size of
214 // the child we want to move to?
215 if (Active.Len >= SubstringLen) {
216 // If yes, then consume the characters we've seen and move to the next
217 // node.
219 "Expected an internal node?");
220 Active.Idx += SubstringLen;
221 Active.Len -= SubstringLen;
222 Active.Node = cast<SuffixTreeInternalNode>(NextNode);
223 continue;
224 }
225
226 // Otherwise, the suffix we're trying to insert must be contained in the
227 // next node we want to move to.
228 unsigned LastChar = Str[EndIdx];
229
230 // Is the string we're trying to insert a substring of the next node?
231 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
232 // If yes, then we're done for this step. Remember our insertion point
233 // and move to the next end index. At this point, we have an implicit
234 // suffix tree.
235 if (NeedsLink && !Active.Node->isRoot()) {
236 NeedsLink->setLink(Active.Node);
237 NeedsLink = nullptr;
238 }
239
240 Active.Len++;
241 break;
242 }
243
244 // The string we're trying to insert isn't a substring of the next node,
245 // but matches up to a point. Split the node.
246 //
247 // For example, say we ended our search at a node n and we're trying to
248 // insert ABD. Then we'll create a new node s for AB, reduce n to just
249 // representing C, and insert a new leaf node l to represent d. This
250 // allows us to ensure that if n was a leaf, it remains a leaf.
251 //
252 // | ABC ---split---> | AB
253 // n s
254 // C / \ D
255 // n l
256
257 // The node s from the diagram
258 SuffixTreeInternalNode *SplitNode = insertInternalNode(
259 Active.Node, NextNode->getStartIdx(),
260 NextNode->getStartIdx() + Active.Len - 1, FirstChar);
261
262 // Insert the new node representing the new substring into the tree as
263 // a child of the split node. This is the node l from the diagram.
264 insertLeaf(*SplitNode, EndIdx, LastChar);
265
266 // Make the old node a child of the split node and update its start
267 // index. This is the node n from the diagram.
268 NextNode->incrementStartIdx(Active.Len);
269 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
270
271 // SplitNode is an internal node, update the suffix link.
272 if (NeedsLink)
273 NeedsLink->setLink(SplitNode);
274
275 NeedsLink = SplitNode;
276 }
277
278 // We've added something new to the tree, so there's one less suffix to
279 // add.
280 SuffixesToAdd--;
281
282 if (Active.Node->isRoot()) {
283 if (Active.Len > 0) {
284 Active.Len--;
285 Active.Idx = EndIdx - SuffixesToAdd + 1;
286 }
287 } else {
288 // Start the next phase at the next smallest suffix.
289 Active.Node = Active.Node->getLink();
290 }
291 }
292
293 return SuffixesToAdd;
294}
295
296void SuffixTree::RepeatedSubstringIterator::advance() {
297 // Clear the current state. If we're at the end of the range, then this
298 // is the state we want to be in.
299 RS = RepeatedSubstring();
300 N = nullptr;
301
302 // Each leaf node represents a repeat of a string.
303 SmallVector<unsigned> RepeatedSubstringStarts;
304
305 // Continue visiting nodes until we find one which repeats more than once.
306 while (!InternalNodesToVisit.empty()) {
307 RepeatedSubstringStarts.clear();
308 auto *Curr = InternalNodesToVisit.pop_back_val();
309
310 // Keep track of the length of the string associated with the node. If
311 // it's too short, we'll quit.
312 unsigned Length = Curr->getConcatLen();
313
314 // Iterate over each child, saving internal nodes for visiting.
315 // Internal nodes represent individual strings, which may repeat.
316 for (auto &ChildPair : Curr->Children)
317 // Save all of this node's children for processing.
318 if (auto *InternalChild =
319 dyn_cast<SuffixTreeInternalNode>(ChildPair.second))
320 InternalNodesToVisit.push_back(InternalChild);
321
322 // If length of repeated substring is below threshold, then skip it.
323 if (Length < MinLength)
324 continue;
325
326 // The root never represents a repeated substring. If we're looking at
327 // that, then skip it.
328 if (Curr->isRoot())
329 continue;
330
331 // Collect leaf children or leaf descendants by OutlinerLeafDescendants.
332 if (OutlinerLeafDescendants) {
333 for (unsigned I = Curr->getLeftLeafIdx(); I <= Curr->getRightLeafIdx();
334 ++I)
335 RepeatedSubstringStarts.push_back(LeafNodes[I]->getSuffixIdx());
336 } else {
337 for (auto &ChildPair : Curr->Children)
338 if (auto *Leaf = dyn_cast<SuffixTreeLeafNode>(ChildPair.second))
339 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
340 }
341
342 // Do we have any repeated substrings?
343 if (RepeatedSubstringStarts.size() < 2)
344 continue;
345
346 // Yes. Update the state to reflect this, and then bail out.
347 N = Curr;
348 RS.Length = Length;
349 llvm::append_range(RS.StartIndices, RepeatedSubstringStarts);
350 break;
351 }
352 // At this point, either NewRS is an empty RepeatedSubstring, or it was
353 // set in the above loop. Similarly, N is either nullptr, or the node
354 // associated with NewRS.
355}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the BumpPtrAllocator interface.
#define I(x, y, z)
Definition MD5.cpp:58
static size_t numElementsInSubstring(const SuffixTreeNode *N)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
void push_back(const T &Elt)
LLVM_ABI SuffixTree(const ArrayRef< unsigned > &Str, bool OutlinerLeafDescendants=false)
Construct a suffix tree from a sequence of unsigned integers.
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition SuffixTree.h:44
bool OutlinerLeafDescendants
Whether to consider leaf descendants or only leaf children.
Definition SuffixTree.h:47
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:477
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
#define N
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
void setLink(SuffixTreeInternalNode *L)
Sets Link to L. Assumes L is not null.
A node in a suffix tree which represents a substring or suffix.
LLVM_ABI void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
LLVM_ABI void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
LLVM_ABI unsigned getLeftLeafIdx() const
LLVM_ABI unsigned getRightLeafIdx() const
LLVM_ABI void setRightLeafIdx(unsigned Idx)
Set the index of the right most leaf node of this node to Idx.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
LLVM_ABI unsigned getStartIdx() const
LLVM_ABI void setLeftLeafIdx(unsigned Idx)
Set the index of the left most leaf node of this node to Idx.