LLVM  13.0.0git
SuffixTree.cpp
Go to the documentation of this file.
1 //===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Suffix Tree class.
10 //
11 //===----------------------------------------------------------------------===//
12 
14 #include "llvm/Support/Allocator.h"
15 #include <vector>
16 
17 using namespace llvm;
18 
19 SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
20  Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
21  Active.Node = Root;
22 
23  // Keep track of the number of suffixes we have to add of the current
24  // prefix.
25  unsigned SuffixesToAdd = 0;
26 
27  // Construct the suffix tree iteratively on each prefix of the string.
28  // PfxEndIdx is the end index of the current prefix.
29  // End is one past the last element in the string.
30  for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
31  SuffixesToAdd++;
32  LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
33  SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
34  }
35 
36  // Set the suffix indices of each leaf.
37  assert(Root && "Root node can't be nullptr!");
38  setSuffixIndices();
39 }
40 
41 SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
42  unsigned StartIdx, unsigned Edge) {
43 
44  assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
45 
46  SuffixTreeNode *N = new (NodeAllocator.Allocate())
47  SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
48  Parent.Children[Edge] = N;
49 
50  return N;
51 }
52 
53 SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
54  unsigned StartIdx,
55  unsigned EndIdx, unsigned Edge) {
56 
57  assert(StartIdx <= EndIdx && "String can't start after it ends!");
58  assert(!(!Parent && StartIdx != EmptyIdx) &&
59  "Non-root internal nodes must have parents!");
60 
61  unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
62  SuffixTreeNode *N =
63  new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
64  if (Parent)
65  Parent->Children[Edge] = N;
66 
67  return N;
68 }
69 
70 void SuffixTree::setSuffixIndices() {
71  // List of nodes we need to visit along with the current length of the
72  // string.
73  std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
74 
75  // Current node being visited.
76  SuffixTreeNode *CurrNode = Root;
77 
78  // Sum of the lengths of the nodes down the path to the current one.
79  unsigned CurrNodeLen = 0;
80  ToVisit.push_back({CurrNode, CurrNodeLen});
81  while (!ToVisit.empty()) {
82  std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
83  ToVisit.pop_back();
84  CurrNode->ConcatLen = CurrNodeLen;
85  for (auto &ChildPair : CurrNode->Children) {
86  assert(ChildPair.second && "Node had a null child!");
87  ToVisit.push_back(
88  {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
89  }
90 
91  // No children, so we are at the end of the string.
92  if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
93  CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
94  }
95 }
96 
97 unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
98  SuffixTreeNode *NeedsLink = nullptr;
99 
100  while (SuffixesToAdd > 0) {
101 
102  // Are we waiting to add anything other than just the last character?
103  if (Active.Len == 0) {
104  // If not, then say the active index is the end index.
105  Active.Idx = EndIdx;
106  }
107 
108  assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
109 
110  // The first character in the current substring we're looking at.
111  unsigned FirstChar = Str[Active.Idx];
112 
113  // Have we inserted anything starting with FirstChar at the current node?
114  if (Active.Node->Children.count(FirstChar) == 0) {
115  // If not, then we can just insert a leaf and move to the next step.
116  insertLeaf(*Active.Node, EndIdx, FirstChar);
117 
118  // The active node is an internal node, and we visited it, so it must
119  // need a link if it doesn't have one.
120  if (NeedsLink) {
121  NeedsLink->Link = Active.Node;
122  NeedsLink = nullptr;
123  }
124  } else {
125  // There's a match with FirstChar, so look for the point in the tree to
126  // insert a new node.
127  SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
128 
129  unsigned SubstringLen = NextNode->size();
130 
131  // Is the current suffix we're trying to insert longer than the size of
132  // the child we want to move to?
133  if (Active.Len >= SubstringLen) {
134  // If yes, then consume the characters we've seen and move to the next
135  // node.
136  Active.Idx += SubstringLen;
137  Active.Len -= SubstringLen;
138  Active.Node = NextNode;
139  continue;
140  }
141 
142  // Otherwise, the suffix we're trying to insert must be contained in the
143  // next node we want to move to.
144  unsigned LastChar = Str[EndIdx];
145 
146  // Is the string we're trying to insert a substring of the next node?
147  if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
148  // If yes, then we're done for this step. Remember our insertion point
149  // and move to the next end index. At this point, we have an implicit
150  // suffix tree.
151  if (NeedsLink && !Active.Node->isRoot()) {
152  NeedsLink->Link = Active.Node;
153  NeedsLink = nullptr;
154  }
155 
156  Active.Len++;
157  break;
158  }
159 
160  // The string we're trying to insert isn't a substring of the next node,
161  // but matches up to a point. Split the node.
162  //
163  // For example, say we ended our search at a node n and we're trying to
164  // insert ABD. Then we'll create a new node s for AB, reduce n to just
165  // representing C, and insert a new leaf node l to represent d. This
166  // allows us to ensure that if n was a leaf, it remains a leaf.
167  //
168  // | ABC ---split---> | AB
169  // n s
170  // C / \ D
171  // n l
172 
173  // The node s from the diagram
174  SuffixTreeNode *SplitNode =
175  insertInternalNode(Active.Node, NextNode->StartIdx,
176  NextNode->StartIdx + Active.Len - 1, FirstChar);
177 
178  // Insert the new node representing the new substring into the tree as
179  // a child of the split node. This is the node l from the diagram.
180  insertLeaf(*SplitNode, EndIdx, LastChar);
181 
182  // Make the old node a child of the split node and update its start
183  // index. This is the node n from the diagram.
184  NextNode->StartIdx += Active.Len;
185  SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
186 
187  // SplitNode is an internal node, update the suffix link.
188  if (NeedsLink)
189  NeedsLink->Link = SplitNode;
190 
191  NeedsLink = SplitNode;
192  }
193 
194  // We've added something new to the tree, so there's one less suffix to
195  // add.
196  SuffixesToAdd--;
197 
198  if (Active.Node->isRoot()) {
199  if (Active.Len > 0) {
200  Active.Len--;
201  Active.Idx = EndIdx - SuffixesToAdd + 1;
202  }
203  } else {
204  // Start the next phase at the next smallest suffix.
205  Active.Node = Active.Node->Link;
206  }
207  }
208 
209  return SuffixesToAdd;
210 }
llvm
Definition: AllocatorList.h:23
Allocator.h
llvm::SuffixTree::SuffixTree
SuffixTree(const std::vector< unsigned > &Str)
Construct a suffix tree from a sequence of unsigned integers.
Definition: SuffixTree.cpp:19
llvm::EmptyIdx
const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
Definition: SuffixTree.h:23
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::SuffixTreeNode::Link
SuffixTreeNode * Link
For internal nodes, a pointer to the internal node representing the same sequence with the first char...
Definition: SuffixTree.h:83
llvm::SuffixTree::Str
llvm::ArrayRef< unsigned > Str
Each element is an integer representing an instruction in the module.
Definition: SuffixTree.h:140
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::SuffixTreeNode::SuffixIdx
unsigned SuffixIdx
For leaves, the start index of the suffix represented by this node.
Definition: SuffixTree.h:63
SuffixTree.h
llvm::SuffixTreeNode::Children
llvm::DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
Definition: SuffixTree.h:47
llvm::SuffixTreeNode
A node in a suffix tree which represents a substring or suffix.
Definition: SuffixTree.h:40
llvm::SuffixTreeNode::isRoot
bool isRoot() const
Returns true if this node is the root of its owning SuffixTree.
Definition: SuffixTree.h:93
llvm::SuffixTreeNode::StartIdx
unsigned StartIdx
The start index of this node's substring in the main string.
Definition: SuffixTree.h:50
llvm::SuffixTreeNode::ConcatLen
unsigned ConcatLen
The length of the string formed by concatenating the edge labels from the root to this node.
Definition: SuffixTree.h:87
llvm::SuffixTreeNode::size
size_t size() const
Return the number of elements in the substring associated with this node.
Definition: SuffixTree.h:96
N
#define N
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163