LLVM 20.0.0git
SuffixTreeNode.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines nodes for use within a SuffixTree.
10//
11// Each node has either no children or at least two children, with the root
12// being a exception in the empty tree.
13//
14// Children are represented as a map between unsigned integers and nodes. If
15// a node N has a child M on unsigned integer k, then the mapping represented
16// by N is a proper prefix of the mapping represented by M. Note that this,
17// although similar to a trie is somewhat different: each node stores a full
18// substring of the full mapping rather than a single character state.
19//
20// Each internal node contains a pointer to the internal node representing
21// the same string, but with the first character chopped off. This is stored
22// in \p Link. Each leaf node stores the start index of its respective
23// suffix in \p SuffixIdx.
24//===----------------------------------------------------------------------===//
25
26#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28#include "llvm/ADT/DenseMap.h"
29
30namespace llvm {
31
32/// A node in a suffix tree which represents a substring or suffix.
34public:
35 /// Represents an undefined index in the suffix tree.
36 static const unsigned EmptyIdx = -1;
37 enum class NodeKind { ST_Leaf, ST_Internal };
38
39private:
40 const NodeKind Kind;
41
42 /// The start index of this node's substring in the main string.
43 unsigned StartIdx = EmptyIdx;
44
45 /// The length of the string formed by concatenating the edge labels from
46 /// the root to this node.
47 unsigned ConcatLen = 0;
48
49 /// These two indices give a range of indices for its leaf descendants.
50 /// Imagine drawing a tree on paper and assigning a unique index to each leaf
51 /// node in monotonically increasing order from left to right. This way of
52 /// numbering the leaf nodes allows us to associate a continuous range of
53 /// indices with each internal node. For example, if a node has leaf
54 /// descendants with indices i, i+1, ..., j, then its LeftLeafIdx is i and
55 /// its RightLeafIdx is j. These indices are for LeafNodes in the SuffixTree
56 /// class, which is constructed using post-order depth-first traversal.
57 unsigned LeftLeafIdx = EmptyIdx;
58 unsigned RightLeafIdx = EmptyIdx;
59
60public:
61 // LLVM RTTI boilerplate.
62 NodeKind getKind() const { return Kind; }
63
64 /// \return the start index of this node's substring in the entire string.
65 unsigned getStartIdx() const;
66
67 /// \returns the end index of this node.
68 virtual unsigned getEndIdx() const = 0;
69
70 /// \return the index of this node's left most leaf node.
71 unsigned getLeftLeafIdx() const;
72
73 /// \return the index of this node's right most leaf node.
74 unsigned getRightLeafIdx() const;
75
76 /// Set the index of the left most leaf node of this node to \p Idx.
77 void setLeftLeafIdx(unsigned Idx);
78
79 /// Set the index of the right most leaf node of this node to \p Idx.
80 void setRightLeafIdx(unsigned Idx);
81
82 /// Advance this node's StartIdx by \p Inc.
83 void incrementStartIdx(unsigned Inc);
84
85 /// Set the length of the string from the root to this node to \p Len.
86 void setConcatLen(unsigned Len);
87
88 /// \returns the length of the string from the root to this node.
89 unsigned getConcatLen() const;
90
91 SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
92 : Kind(Kind), StartIdx(StartIdx) {}
93 virtual ~SuffixTreeNode() = default;
94};
95
96// A node with two or more children, or the root.
98private:
99 /// The end index of this node's substring in the main string.
100 ///
101 /// Every leaf node must have its \p EndIdx incremented at the end of every
102 /// step in the construction algorithm. To avoid having to update O(N)
103 /// nodes individually at the end of every step, the end index is stored
104 /// as a pointer.
105 unsigned EndIdx = EmptyIdx;
106
107 /// A pointer to the internal node representing the same sequence with the
108 /// first character chopped off.
109 ///
110 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
111 /// Ukkonen's algorithm does to achieve linear-time construction is
112 /// keep track of which node the next insert should be at. This makes each
113 /// insert O(1), and there are a total of O(N) inserts. The suffix link
114 /// helps with inserting children of internal nodes.
115 ///
116 /// Say we add a child to an internal node with associated mapping S. The
117 /// next insertion must be at the node representing S - its first character.
118 /// This is given by the way that we iteratively build the tree in Ukkonen's
119 /// algorithm. The main idea is to look at the suffixes of each prefix in the
120 /// string, starting with the longest suffix of the prefix, and ending with
121 /// the shortest. Therefore, if we keep pointers between such nodes, we can
122 /// move to the next insertion point in O(1) time. If we don't, then we'd
123 /// have to query from the root, which takes O(N) time. This would make the
124 /// construction algorithm O(N^2) rather than O(N).
125 SuffixTreeInternalNode *Link = nullptr;
126
127public:
128 // LLVM RTTI boilerplate.
129 static bool classof(const SuffixTreeNode *N) {
130 return N->getKind() == NodeKind::ST_Internal;
131 }
132
133 /// \returns true if this node is the root of its owning \p SuffixTree.
134 bool isRoot() const;
135
136 /// \returns the end index of this node's substring in the entire string.
137 unsigned getEndIdx() const override;
138
139 /// Sets \p Link to \p L. Assumes \p L is not null.
141
142 /// \returns the pointer to the Link node.
144
145 /// The children of this node.
146 ///
147 /// A child existing on an unsigned integer implies that from the mapping
148 /// represented by the current node, there is a way to reach another
149 /// mapping by tacking that character on the end of the current string.
151
152 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
154 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
155 Link(Link) {}
156
157 virtual ~SuffixTreeInternalNode() = default;
158};
159
160// A node representing a suffix.
162private:
163 /// The start index of the suffix represented by this leaf.
164 unsigned SuffixIdx = EmptyIdx;
165
166 /// The end index of this node's substring in the main string.
167 ///
168 /// Every leaf node must have its \p EndIdx incremented at the end of every
169 /// step in the construction algorithm. To avoid having to update O(N)
170 /// nodes individually at the end of every step, the end index is stored
171 /// as a pointer.
172 unsigned *EndIdx = nullptr;
173
174public:
175 // LLVM RTTI boilerplate.
176 static bool classof(const SuffixTreeNode *N) {
177 return N->getKind() == NodeKind::ST_Leaf;
178 }
179
180 /// \returns the end index of this node's substring in the entire string.
181 unsigned getEndIdx() const override;
182
183 /// \returns the start index of the suffix represented by this leaf.
184 unsigned getSuffixIdx() const;
185
186 /// Sets the start index of the suffix represented by this leaf to \p Idx.
187 void setSuffixIdx(unsigned Idx);
188 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
189 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
190
191 virtual ~SuffixTreeLeafNode() = default;
192};
193} // namespace llvm
194#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
#define N
Determine the kind of a node from its type.
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
virtual ~SuffixTreeInternalNode()=default
static bool classof(const SuffixTreeNode *N)
SuffixTreeInternalNode * getLink() const
unsigned getEndIdx() const override
void setLink(SuffixTreeInternalNode *L)
Sets Link to L. Assumes L is not null.
SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, SuffixTreeInternalNode *Link)
virtual ~SuffixTreeLeafNode()=default
void setSuffixIdx(unsigned Idx)
Sets the start index of the suffix represented by this leaf to Idx.
unsigned getEndIdx() const override
static bool classof(const SuffixTreeNode *N)
unsigned getSuffixIdx() const
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
A node in a suffix tree which represents a substring or suffix.
void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
NodeKind getKind() const
void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
unsigned getLeftLeafIdx() const
virtual ~SuffixTreeNode()=default
unsigned getRightLeafIdx() const
void setRightLeafIdx(unsigned Idx)
Set the index of the right most leaf node of this node to Idx.
SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
unsigned getConcatLen() const
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
unsigned getStartIdx() const
void setLeftLeafIdx(unsigned Idx)
Set the index of the left most leaf node of this node to Idx.
virtual unsigned getEndIdx() const =0