LLVM 19.0.0git
SuffixTreeNode.h
Go to the documentation of this file.
1//===- llvm/ADT/SuffixTreeNode.h - Nodes for SuffixTrees --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines nodes for use within a SuffixTree.
10//
11// Each node has either no children or at least two children, with the root
12// being a exception in the empty tree.
13//
14// Children are represented as a map between unsigned integers and nodes. If
15// a node N has a child M on unsigned integer k, then the mapping represented
16// by N is a proper prefix of the mapping represented by M. Note that this,
17// although similar to a trie is somewhat different: each node stores a full
18// substring of the full mapping rather than a single character state.
19//
20// Each internal node contains a pointer to the internal node representing
21// the same string, but with the first character chopped off. This is stored
22// in \p Link. Each leaf node stores the start index of its respective
23// suffix in \p SuffixIdx.
24//===----------------------------------------------------------------------===//
25
26#ifndef LLVM_SUPPORT_SUFFIXTREE_NODE_H
27#define LLVM_SUPPORT_SUFFIXTREE_NODE_H
28#include "llvm/ADT/DenseMap.h"
29
30namespace llvm {
31
32/// A node in a suffix tree which represents a substring or suffix.
34public:
35 /// Represents an undefined index in the suffix tree.
36 static const unsigned EmptyIdx = -1;
37 enum class NodeKind { ST_Leaf, ST_Internal };
38
39private:
40 const NodeKind Kind;
41
42 /// The start index of this node's substring in the main string.
43 unsigned StartIdx = EmptyIdx;
44
45 /// The length of the string formed by concatenating the edge labels from
46 /// the root to this node.
47 unsigned ConcatLen = 0;
48
49public:
50 // LLVM RTTI boilerplate.
51 NodeKind getKind() const { return Kind; }
52
53 /// \return the start index of this node's substring in the entire string.
54 unsigned getStartIdx() const;
55
56 /// \returns the end index of this node.
57 virtual unsigned getEndIdx() const = 0;
58
59 /// Advance this node's StartIdx by \p Inc.
60 void incrementStartIdx(unsigned Inc);
61
62 /// Set the length of the string from the root to this node to \p Len.
63 void setConcatLen(unsigned Len);
64
65 /// \returns the length of the string from the root to this node.
66 unsigned getConcatLen() const;
67
68 SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
69 : Kind(Kind), StartIdx(StartIdx) {}
70 virtual ~SuffixTreeNode() = default;
71};
72
73// A node with two or more children, or the root.
75private:
76 /// The end index of this node's substring in the main string.
77 ///
78 /// Every leaf node must have its \p EndIdx incremented at the end of every
79 /// step in the construction algorithm. To avoid having to update O(N)
80 /// nodes individually at the end of every step, the end index is stored
81 /// as a pointer.
82 unsigned EndIdx = EmptyIdx;
83
84 /// A pointer to the internal node representing the same sequence with the
85 /// first character chopped off.
86 ///
87 /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
88 /// Ukkonen's algorithm does to achieve linear-time construction is
89 /// keep track of which node the next insert should be at. This makes each
90 /// insert O(1), and there are a total of O(N) inserts. The suffix link
91 /// helps with inserting children of internal nodes.
92 ///
93 /// Say we add a child to an internal node with associated mapping S. The
94 /// next insertion must be at the node representing S - its first character.
95 /// This is given by the way that we iteratively build the tree in Ukkonen's
96 /// algorithm. The main idea is to look at the suffixes of each prefix in the
97 /// string, starting with the longest suffix of the prefix, and ending with
98 /// the shortest. Therefore, if we keep pointers between such nodes, we can
99 /// move to the next insertion point in O(1) time. If we don't, then we'd
100 /// have to query from the root, which takes O(N) time. This would make the
101 /// construction algorithm O(N^2) rather than O(N).
102 SuffixTreeInternalNode *Link = nullptr;
103
104public:
105 // LLVM RTTI boilerplate.
106 static bool classof(const SuffixTreeNode *N) {
107 return N->getKind() == NodeKind::ST_Internal;
108 }
109
110 /// \returns true if this node is the root of its owning \p SuffixTree.
111 bool isRoot() const;
112
113 /// \returns the end index of this node's substring in the entire string.
114 unsigned getEndIdx() const override;
115
116 /// Sets \p Link to \p L. Assumes \p L is not null.
118
119 /// \returns the pointer to the Link node.
121
122 /// The children of this node.
123 ///
124 /// A child existing on an unsigned integer implies that from the mapping
125 /// represented by the current node, there is a way to reach another
126 /// mapping by tacking that character on the end of the current string.
128
129 SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx,
131 : SuffixTreeNode(NodeKind::ST_Internal, StartIdx), EndIdx(EndIdx),
132 Link(Link) {}
133
134 virtual ~SuffixTreeInternalNode() = default;
135};
136
137// A node representing a suffix.
139private:
140 /// The start index of the suffix represented by this leaf.
141 unsigned SuffixIdx = EmptyIdx;
142
143 /// The end index of this node's substring in the main string.
144 ///
145 /// Every leaf node must have its \p EndIdx incremented at the end of every
146 /// step in the construction algorithm. To avoid having to update O(N)
147 /// nodes individually at the end of every step, the end index is stored
148 /// as a pointer.
149 unsigned *EndIdx = nullptr;
150
151public:
152 // LLVM RTTI boilerplate.
153 static bool classof(const SuffixTreeNode *N) {
154 return N->getKind() == NodeKind::ST_Leaf;
155 }
156
157 /// \returns the end index of this node's substring in the entire string.
158 unsigned getEndIdx() const override;
159
160 /// \returns the start index of the suffix represented by this leaf.
161 unsigned getSuffixIdx() const;
162
163 /// Sets the start index of the suffix represented by this leaf to \p Idx.
164 void setSuffixIdx(unsigned Idx);
165 SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
166 : SuffixTreeNode(NodeKind::ST_Leaf, StartIdx), EndIdx(EndIdx) {}
167
168 virtual ~SuffixTreeLeafNode() = default;
169};
170} // namespace llvm
171#endif // LLVM_SUPPORT_SUFFIXTREE_NODE_H
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
#define N
Determine the kind of a node from its type.
DenseMap< unsigned, SuffixTreeNode * > Children
The children of this node.
virtual ~SuffixTreeInternalNode()=default
static bool classof(const SuffixTreeNode *N)
SuffixTreeInternalNode * getLink() const
unsigned getEndIdx() const override
void setLink(SuffixTreeInternalNode *L)
Sets Link to L. Assumes L is not null.
SuffixTreeInternalNode(unsigned StartIdx, unsigned EndIdx, SuffixTreeInternalNode *Link)
virtual ~SuffixTreeLeafNode()=default
void setSuffixIdx(unsigned Idx)
Sets the start index of the suffix represented by this leaf to Idx.
unsigned getEndIdx() const override
static bool classof(const SuffixTreeNode *N)
unsigned getSuffixIdx() const
SuffixTreeLeafNode(unsigned StartIdx, unsigned *EndIdx)
A node in a suffix tree which represents a substring or suffix.
void incrementStartIdx(unsigned Inc)
Advance this node's StartIdx by Inc.
NodeKind getKind() const
void setConcatLen(unsigned Len)
Set the length of the string from the root to this node to Len.
virtual ~SuffixTreeNode()=default
SuffixTreeNode(NodeKind Kind, unsigned StartIdx)
unsigned getConcatLen() const
static const unsigned EmptyIdx
Represents an undefined index in the suffix tree.
unsigned getStartIdx() const
virtual unsigned getEndIdx() const =0