LLVM 22.0.0git
OnDiskTrieRawHashMap.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file declares interface for OnDiskTrieRawHashMap, a thread-safe and
11/// (mostly) lock-free hash map stored as trie and backed by persistent files on
12/// disk.
13///
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
17#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
18
19#include "llvm/ADT/ArrayRef.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/StringRef.h"
23#include "llvm/CAS/FileOffset.h"
24#include "llvm/Support/Error.h"
25#include <optional>
26
27namespace llvm {
28
29class raw_ostream;
30
31namespace cas {
32
33/// OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
34/// The keys are fixed length, and are expected to be binary hashes with a
35/// normal distribution.
36///
37/// - Thread-safety is achieved through the use of atomics within a shared
38/// memory mapping. Atomic access does not work on networked filesystems.
39/// - Filesystem locks are used, but only sparingly:
40/// - during initialization, for creating / opening an existing store;
41/// - for the lifetime of the instance, a shared/reader lock is held
42/// - during destruction, if there are no concurrent readers, to shrink the
43/// files to their minimum size.
44/// - Path is used as a directory:
45/// - "index" stores the root trie and subtries.
46/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
47/// - Large entries are stored externally in a file named by the key.
48/// - Code is system-dependent and binary format itself is not portable. These
49/// are not artifacts that can/should be moved between different systems; they
50/// are only appropriate for local storage.
52public:
53 LLVM_DUMP_METHOD void dump() const;
54 void
56 function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
57
58public:
59 /// Const value proxy to access the records stored in TrieRawHashMap.
70
71 /// Value proxy to access the records stored in TrieRawHashMap.
82
83 /// Validate the trie data structure.
84 ///
85 /// Callback receives the file offset to the data entry and the data stored.
87 function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
88
89 /// Check the valid range of file offset for OnDiskTrieRawHashMap.
91 return Offset.get() < (1LL << 48);
92 }
93
94public:
95 /// Template class to implement a `pointer` type into the trie data structure.
96 ///
97 /// It provides pointer-like operation, e.g., dereference to get underlying
98 /// data. It also reserves the top 16 bits of the pointer value, which can be
99 /// used to pack additional information if needed.
100 template <class ProxyT> class PointerImpl {
101 public:
104 }
105
106 explicit operator bool() const { return IsValue; }
107
108 const ProxyT &operator*() const {
110 return Value;
111 }
112 const ProxyT *operator->() const {
114 return &Value;
115 }
116
117 PointerImpl() = default;
118
119 protected:
123 if (IsValue)
125 }
126
127 ProxyT Value;
130
131 // True if points to a value (not a "nullptr"). Use an extra field because
132 // 0 can be a valid offset.
133 bool IsValue = false;
134 };
135
136 class OnDiskPtr;
137 class ConstOnDiskPtr : public PointerImpl<ConstValueProxy> {
138 public:
139 ConstOnDiskPtr() = default;
140
141 private:
142 friend class OnDiskPtr;
144 using ConstOnDiskPtr::PointerImpl::PointerImpl;
145 };
146
147 class OnDiskPtr : public PointerImpl<ValueProxy> {
148 public:
149 operator ConstOnDiskPtr() const {
151 }
152
153 OnDiskPtr() = default;
154
155 private:
157 using OnDiskPtr::PointerImpl::PointerImpl;
158 };
159
160 /// Find the value from hash.
161 ///
162 /// \returns pointer to the value if exists, otherwise returns a non-value
163 /// pointer that evaluates to `false` when convert to boolean.
165
166 /// Helper function to recover a pointer into the trie from file offset.
169
171 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
173 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
174 FileOffset FinalOffset, ValueProxy FinalValue)>;
175
176 /// Insert lazily.
177 ///
178 /// \p OnConstruct is called when ready to insert a value, after allocating
179 /// space for the data. It is called at most once.
180 ///
181 /// \p OnLeak is called only if \p OnConstruct has been called and a race
182 /// occurred before insertion, causing the tentative offset and data to be
183 /// abandoned. This allows clients to clean up other results or update any
184 /// references.
185 ///
186 /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
187 /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
188 /// simultaneous writes, but that seems dangerous to use in a memory-mapped
189 /// file in case a process crashes in the busy state.
192 LazyInsertOnConstructCB OnConstruct = nullptr,
193 LazyInsertOnLeakCB OnLeak = nullptr);
194
196 return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
197 assert(Allocated.Hash == Value.Hash);
198 assert(Allocated.Data.size() == Value.Data.size());
199 llvm::copy(Value.Data, Allocated.Data.begin());
200 });
201 }
202
203 LLVM_ABI_FOR_TEST size_t size() const;
204 LLVM_ABI_FOR_TEST size_t capacity() const;
205
206 /// Gets or creates a file at \p Path with a hash-mapped trie named \p
207 /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
208 /// data of size \p DataSize (in bytes).
209 ///
210 /// \p MaxFileSize controls the maximum file size to support, limiting the
211 /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
212 /// size if a new file is created.
213 ///
214 /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
215 /// configure the trie, if it doesn't already exist.
216 ///
217 /// \pre NumHashBits is a multiple of 8 (byte-aligned).
219 create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
220 uint64_t DataSize, uint64_t MaxFileSize,
221 std::optional<uint64_t> NewFileInitialSize,
222 std::optional<size_t> NewTableNumRootBits = std::nullopt,
223 std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
224
228
229private:
230 struct ImplType;
231 explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
232 std::unique_ptr<ImplType> Impl;
233};
234
235} // namespace cas
236} // namespace llvm
237
238#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
#define LLVM_ABI_FOR_TEST
Definition Compiler.h:218
This file declares interface for FileOffset that represent stored data at an offset from the beginnin...
This file contains some templates that are useful if you are working with the STL at all.
Value * RHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue=true)
Expected< OnDiskPtr > insert(const ConstValueProxy &Value)
LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS)
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue)> LazyInsertOnConstructCB
LLVM_ABI_FOR_TEST Error validate(function_ref< Error(FileOffset, ConstValueProxy)> RecordVerifier) const
Validate the trie data structure.
static LLVM_ABI_FOR_TEST Expected< OnDiskTrieRawHashMap > create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, std::optional< size_t > NewTableNumRootBits=std::nullopt, std::optional< size_t > NewTableNumSubtrieBits=std::nullopt)
Gets or creates a file at Path with a hash-mapped trie named TrieName.
LLVM_ABI_FOR_TEST ConstOnDiskPtr find(ArrayRef< uint8_t > Hash) const
Find the value from hash.
LLVM_DUMP_METHOD void dump() const
LLVM_ABI_FOR_TEST size_t size() const
static bool validOffset(FileOffset Offset)
Check the valid range of file offset for OnDiskTrieRawHashMap.
LLVM_ABI_FOR_TEST Expected< OnDiskPtr > insertLazy(ArrayRef< uint8_t > Hash, LazyInsertOnConstructCB OnConstruct=nullptr, LazyInsertOnLeakCB OnLeak=nullptr)
Insert lazily.
LLVM_ABI_FOR_TEST size_t capacity() const
LLVM_ABI_FOR_TEST Expected< ConstOnDiskPtr > recoverFromFileOffset(FileOffset Offset) const
Helper function to recover a pointer into the trie from file offset.
LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap & operator=(OnDiskTrieRawHashMap &&RHS)
void print(raw_ostream &OS, function_ref< void(ArrayRef< char >)> PrintRecordData=nullptr) const
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue, FileOffset FinalOffset, ValueProxy FinalValue)> LazyInsertOnLeakCB
LLVM_ABI_FOR_TEST ~OnDiskTrieRawHashMap()
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Const value proxy to access the records stored in TrieRawHashMap.
ConstValueProxy(ArrayRef< uint8_t > Hash, StringRef Data)
ConstValueProxy(ArrayRef< uint8_t > Hash, ArrayRef< char > Data)
Value proxy to access the records stored in TrieRawHashMap.
ValueProxy(ArrayRef< uint8_t > Hash, MutableArrayRef< char > Data)