LLVM 19.0.0git
SuffixTree.cpp
Go to the documentation of this file.
1//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Suffix Tree class.
10//
11//===----------------------------------------------------------------------===//
12
17
18using namespace llvm;
19
20/// \returns the number of elements in the substring associated with \p N.
22 assert(N && "Got a null node?");
23 if (auto *Internal = dyn_cast<SuffixTreeInternalNode>(N))
24 if (Internal->isRoot())
25 return 0;
26 return N->getEndIdx() - N->getStartIdx() + 1;
27}
28
30 Root = insertRoot();
31 Active.Node = Root;
32
33 // Keep track of the number of suffixes we have to add of the current
34 // prefix.
35 unsigned SuffixesToAdd = 0;
36
37 // Construct the suffix tree iteratively on each prefix of the string.
38 // PfxEndIdx is the end index of the current prefix.
39 // End is one past the last element in the string.
40 for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
41 SuffixesToAdd++;
42 LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
43 SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
44 }
45
46 // Set the suffix indices of each leaf.
47 assert(Root && "Root node can't be nullptr!");
48 setSuffixIndices();
49}
50
51SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeInternalNode &Parent,
52 unsigned StartIdx, unsigned Edge) {
53 assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
54 auto *N = new (LeafNodeAllocator.Allocate())
55 SuffixTreeLeafNode(StartIdx, &LeafEndIdx);
56 Parent.Children[Edge] = N;
57 return N;
58}
59
61SuffixTree::insertInternalNode(SuffixTreeInternalNode *Parent,
62 unsigned StartIdx, unsigned EndIdx,
63 unsigned Edge) {
64 assert(StartIdx <= EndIdx && "String can't start after it ends!");
65 assert(!(!Parent && StartIdx != SuffixTreeNode::EmptyIdx) &&
66 "Non-root internal nodes must have parents!");
67 auto *N = new (InternalNodeAllocator.Allocate())
68 SuffixTreeInternalNode(StartIdx, EndIdx, Root);
69 if (Parent)
70 Parent->Children[Edge] = N;
71 return N;
72}
73
74SuffixTreeInternalNode *SuffixTree::insertRoot() {
75 return insertInternalNode(/*Parent = */ nullptr, SuffixTreeNode::EmptyIdx,
76 SuffixTreeNode::EmptyIdx, /*Edge = */ 0);
77}
78
79void SuffixTree::setSuffixIndices() {
80 // List of nodes we need to visit along with the current length of the
81 // string.
83
84 // Current node being visited.
85 SuffixTreeNode *CurrNode = Root;
86
87 // Sum of the lengths of the nodes down the path to the current one.
88 unsigned CurrNodeLen = 0;
89 ToVisit.push_back({CurrNode, CurrNodeLen});
90 while (!ToVisit.empty()) {
91 std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
92 ToVisit.pop_back();
93 // Length of the current node from the root down to here.
94 CurrNode->setConcatLen(CurrNodeLen);
95 if (auto *InternalNode = dyn_cast<SuffixTreeInternalNode>(CurrNode))
96 for (auto &ChildPair : InternalNode->Children) {
97 assert(ChildPair.second && "Node had a null child!");
98 ToVisit.push_back(
99 {ChildPair.second,
100 CurrNodeLen + numElementsInSubstring(ChildPair.second)});
101 }
102 // No children, so we are at the end of the string.
103 if (auto *LeafNode = dyn_cast<SuffixTreeLeafNode>(CurrNode))
104 LeafNode->setSuffixIdx(Str.size() - CurrNodeLen);
105 }
106}
107
108unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
109 SuffixTreeInternalNode *NeedsLink = nullptr;
110
111 while (SuffixesToAdd > 0) {
112
113 // Are we waiting to add anything other than just the last character?
114 if (Active.Len == 0) {
115 // If not, then say the active index is the end index.
116 Active.Idx = EndIdx;
117 }
118
119 assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
120
121 // The first character in the current substring we're looking at.
122 unsigned FirstChar = Str[Active.Idx];
123
124 // Have we inserted anything starting with FirstChar at the current node?
125 if (Active.Node->Children.count(FirstChar) == 0) {
126 // If not, then we can just insert a leaf and move to the next step.
127 insertLeaf(*Active.Node, EndIdx, FirstChar);
128
129 // The active node is an internal node, and we visited it, so it must
130 // need a link if it doesn't have one.
131 if (NeedsLink) {
132 NeedsLink->setLink(Active.Node);
133 NeedsLink = nullptr;
134 }
135 } else {
136 // There's a match with FirstChar, so look for the point in the tree to
137 // insert a new node.
138 SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
139
140 unsigned SubstringLen = numElementsInSubstring(NextNode);
141
142 // Is the current suffix we're trying to insert longer than the size of
143 // the child we want to move to?
144 if (Active.Len >= SubstringLen) {
145 // If yes, then consume the characters we've seen and move to the next
146 // node.
147 assert(isa<SuffixTreeInternalNode>(NextNode) &&
148 "Expected an internal node?");
149 Active.Idx += SubstringLen;
150 Active.Len -= SubstringLen;
151 Active.Node = cast<SuffixTreeInternalNode>(NextNode);
152 continue;
153 }
154
155 // Otherwise, the suffix we're trying to insert must be contained in the
156 // next node we want to move to.
157 unsigned LastChar = Str[EndIdx];
158
159 // Is the string we're trying to insert a substring of the next node?
160 if (Str[NextNode->getStartIdx() + Active.Len] == LastChar) {
161 // If yes, then we're done for this step. Remember our insertion point
162 // and move to the next end index. At this point, we have an implicit
163 // suffix tree.
164 if (NeedsLink && !Active.Node->isRoot()) {
165 NeedsLink->setLink(Active.Node);
166 NeedsLink = nullptr;
167 }
168
169 Active.Len++;
170 break;
171 }
172
173 // The string we're trying to insert isn't a substring of the next node,
174 // but matches up to a point. Split the node.
175 //
176 // For example, say we ended our search at a node n and we're trying to
177 // insert ABD. Then we'll create a new node s for AB, reduce n to just
178 // representing C, and insert a new leaf node l to represent d. This
179 // allows us to ensure that if n was a leaf, it remains a leaf.
180 //
181 // | ABC ---split---> | AB
182 // n s
183 // C / \ D
184 // n l
185
186 // The node s from the diagram
187 SuffixTreeInternalNode *SplitNode = insertInternalNode(
188 Active.Node, NextNode->getStartIdx(),
189 NextNode->getStartIdx() + Active.Len - 1, FirstChar);
190
191 // Insert the new node representing the new substring into the tree as
192 // a child of the split node. This is the node l from the diagram.
193 insertLeaf(*SplitNode, EndIdx, LastChar);
194
195 // Make the old node a child of the split node and update its start
196 // index. This is the node n from the diagram.
197 NextNode->incrementStartIdx(Active.Len);
198 SplitNode->Children[Str[NextNode->getStartIdx()]] = NextNode;
199
200 // SplitNode is an internal node, update the suffix link.
201 if (NeedsLink)
202 NeedsLink->setLink(SplitNode);
203
204 NeedsLink = SplitNode;
205 }
206
207 // We've added something new to the tree, so there's one less suffix to
208 // add.
209 SuffixesToAdd--;
210
211 if (Active.Node->isRoot()) {
212 if (Active.Len > 0) {
213 Active.Len--;
214 Active.Idx = EndIdx - SuffixesToAdd + 1;
215 }
216 } else {
217 // Start the next phase at the next smallest suffix.
218 Active.Node = Active.Node->getLink();
219 }
220 }
221
222 return SuffixesToAdd;
223}
224
225void SuffixTree::RepeatedSubstringIterator::advance() {
226 // Clear the current state. If we're at the end of the range, then this
227 // is the state we want to be in.
228 RS = RepeatedSubstring();
229 N = nullptr;
230
231 // Each leaf node represents a repeat of a string.
232 SmallVector<unsigned> RepeatedSubstringStarts;
233
234 // Continue visiting nodes until we find one which repeats more than once.
235 while (!InternalNodesToVisit.empty()) {
236 RepeatedSubstringStarts.clear();
237 auto *Curr = InternalNodesToVisit.back();
238 InternalNodesToVisit.pop_back();
239
240 // Keep track of the length of the string associated with the node. If
241 // it's too short, we'll quit.
242 unsigned Length = Curr->getConcatLen();
243
244 // Iterate over each child, saving internal nodes for visiting, and
245 // leaf nodes in LeafChildren. Internal nodes represent individual
246 // strings, which may repeat.
247 for (auto &ChildPair : Curr->Children) {
248 // Save all of this node's children for processing.
249 if (auto *InternalChild =
250 dyn_cast<SuffixTreeInternalNode>(ChildPair.second)) {
251 InternalNodesToVisit.push_back(InternalChild);
252 continue;
253 }
254
255 if (Length < MinLength)
256 continue;
257
258 // Have an occurrence of a potentially repeated string. Save it.
259 auto *Leaf = cast<SuffixTreeLeafNode>(ChildPair.second);
260 RepeatedSubstringStarts.push_back(Leaf->getSuffixIdx());
261 }
262
263 // The root never represents a repeated substring. If we're looking at
264 // that, then skip it.
265 if (Curr->isRoot())
266 continue;
267
268 // Do we have any repeated substrings?
269 if (RepeatedSubstringStarts.size() < 2)
270 continue;
271
272 // Yes. Update the state to reflect this, and then bail out.
273 N = Curr;
274 RS.Length = Length;
275 for (unsigned StartIdx : RepeatedSubstringStarts)
276 RS.StartIndices.push_back(StartIdx);
277 break;
278 }
279 // At this point, either NewRS is an empty RepeatedSubstring, or it was
280 // set in the above loop. Similarly, N is either nullptr, or the node
281 // associated with NewRS.
282}
This file defines the BumpPtrAllocator interface.
bool End
Definition: ELF_riscv.cpp:480
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static size_t numElementsInSubstring(const SuffixTreeNode *N)
Definition: SuffixTree.cpp:21
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
SuffixTree(const ArrayRef< unsigned > &Str)
Construct a suffix tree from a sequence of unsigned integers.
Definition: SuffixTree.cpp:29
ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:43
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:456
#define N
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
void setLink(SuffixTreeInternalNode *L)
Sets Link to L. Assumes L is not null.
A node in a suffix tree which represents a substring or suffix.
void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
unsigned getStartIdx() const
SmallVector< unsigned > StartIndices
The start indices of each occurrence.
Definition: SuffixTree.h:51
unsigned Length
The length of the string.
Definition: SuffixTree.h:48