LLVM 23.0.0git
OnDiskTrieRawHashMap.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file declares interface for OnDiskTrieRawHashMap, a thread-safe and
11/// (mostly) lock-free hash map stored as trie and backed by persistent files on
12/// disk.
13///
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
17#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
18
19#include "llvm/ADT/ArrayRef.h"
20#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/StringRef.h"
23#include "llvm/CAS/FileOffset.h"
24#include "llvm/Support/Error.h"
25#include <optional>
26
27namespace llvm {
28
29class raw_ostream;
30
31namespace cas {
32
33namespace ondisk {
34class OnDiskCASLogger;
35} // namespace ondisk
36/// OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps.
37/// The keys are fixed length, and are expected to be binary hashes with a
38/// normal distribution.
39///
40/// - Thread-safety is achieved through the use of atomics within a shared
41/// memory mapping. Atomic access does not work on networked filesystems.
42/// - Filesystem locks are used, but only sparingly:
43/// - during initialization, for creating / opening an existing store;
44/// - for the lifetime of the instance, a shared/reader lock is held
45/// - during destruction, if there are no concurrent readers, to shrink the
46/// files to their minimum size.
47/// - Path is used as a directory:
48/// - "index" stores the root trie and subtries.
49/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
50/// - Large entries are stored externally in a file named by the key.
51/// - Code is system-dependent and binary format itself is not portable. These
52/// are not artifacts that can/should be moved between different systems; they
53/// are only appropriate for local storage.
55public:
56 LLVM_DUMP_METHOD void dump() const;
57 void
59 function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
60
61public:
62 /// Const value proxy to access the records stored in TrieRawHashMap.
73
74 /// Value proxy to access the records stored in TrieRawHashMap.
85
86 /// Validate the trie data structure.
87 ///
88 /// Callback receives the file offset to the data entry and the data stored.
90 function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
91
92 /// Check the valid range of file offset for OnDiskTrieRawHashMap.
94 return Offset.get() < (1LL << 48);
95 }
96
97public:
98 /// Template class to implement a `pointer` type into the trie data structure.
99 ///
100 /// It provides pointer-like operation, e.g., dereference to get underlying
101 /// data. It also reserves the top 16 bits of the pointer value, which can be
102 /// used to pack additional information if needed.
103 template <class ProxyT> class PointerImpl {
104 public:
107 }
108
109 explicit operator bool() const { return IsValue; }
110
111 const ProxyT &operator*() const {
113 return Value;
114 }
115 const ProxyT *operator->() const {
117 return &Value;
118 }
119
120 PointerImpl() = default;
121
122 protected:
126 if (IsValue)
128 }
129
130 ProxyT Value;
133
134 // True if points to a value (not a "nullptr"). Use an extra field because
135 // 0 can be a valid offset.
136 bool IsValue = false;
137 };
138
139 class OnDiskPtr;
140 class ConstOnDiskPtr : public PointerImpl<ConstValueProxy> {
141 public:
142 ConstOnDiskPtr() = default;
143
144 private:
145 friend class OnDiskPtr;
147 using ConstOnDiskPtr::PointerImpl::PointerImpl;
148 };
149
150 class OnDiskPtr : public PointerImpl<ValueProxy> {
151 public:
152 operator ConstOnDiskPtr() const {
154 }
155
156 OnDiskPtr() = default;
157
158 private:
160 using OnDiskPtr::PointerImpl::PointerImpl;
161 };
162
163 /// Find the value from hash.
164 ///
165 /// \returns pointer to the value if exists, otherwise returns a non-value
166 /// pointer that evaluates to `false` when convert to boolean.
168
169 /// Helper function to recover a pointer into the trie from file offset.
172
174 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
176 function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
177 FileOffset FinalOffset, ValueProxy FinalValue)>;
178
179 /// Insert lazily.
180 ///
181 /// \p OnConstruct is called when ready to insert a value, after allocating
182 /// space for the data. It is called at most once.
183 ///
184 /// \p OnLeak is called only if \p OnConstruct has been called and a race
185 /// occurred before insertion, causing the tentative offset and data to be
186 /// abandoned. This allows clients to clean up other results or update any
187 /// references.
188 ///
189 /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
190 /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
191 /// simultaneous writes, but that seems dangerous to use in a memory-mapped
192 /// file in case a process crashes in the busy state.
195 LazyInsertOnConstructCB OnConstruct = nullptr,
196 LazyInsertOnLeakCB OnLeak = nullptr);
197
199 return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
200 assert(Allocated.Hash == Value.Hash);
201 assert(Allocated.Data.size() == Value.Data.size());
202 llvm::copy(Value.Data, Allocated.Data.begin());
203 });
204 }
205
206 LLVM_ABI_FOR_TEST size_t size() const;
207 LLVM_ABI_FOR_TEST size_t capacity() const;
208
209 /// Gets or creates a file at \p Path with a hash-mapped trie named \p
210 /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
211 /// data of size \p DataSize (in bytes).
212 ///
213 /// \p MaxFileSize controls the maximum file size to support, limiting the
214 /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
215 /// size if a new file is created.
216 ///
217 /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
218 /// configure the trie, if it doesn't already exist.
219 ///
220 /// \pre NumHashBits is a multiple of 8 (byte-aligned).
222 create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
223 uint64_t DataSize, uint64_t MaxFileSize,
224 std::optional<uint64_t> NewFileInitialSize,
225 std::shared_ptr<ondisk::OnDiskCASLogger> Logger = nullptr,
226 std::optional<size_t> NewTableNumRootBits = std::nullopt,
227 std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
228
232
233private:
234 struct ImplType;
235 explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
236 std::unique_ptr<ImplType> Impl;
237};
238
239} // namespace cas
240} // namespace llvm
241
242#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
#define LLVM_ABI_FOR_TEST
Definition Compiler.h:218
This file declares interface for FileOffset that represent stored data at an offset from the beginnin...
This file contains some templates that are useful if you are working with the STL at all.
Value * RHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
Logging utility - given an ordered specification of features, and assuming a scalar reward,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue=true)
Expected< OnDiskPtr > insert(const ConstValueProxy &Value)
LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS)
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue)> LazyInsertOnConstructCB
LLVM_ABI_FOR_TEST Error validate(function_ref< Error(FileOffset, ConstValueProxy)> RecordVerifier) const
Validate the trie data structure.
LLVM_ABI_FOR_TEST ConstOnDiskPtr find(ArrayRef< uint8_t > Hash) const
Find the value from hash.
LLVM_DUMP_METHOD void dump() const
static LLVM_ABI_FOR_TEST Expected< OnDiskTrieRawHashMap > create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, uint64_t DataSize, uint64_t MaxFileSize, std::optional< uint64_t > NewFileInitialSize, std::shared_ptr< ondisk::OnDiskCASLogger > Logger=nullptr, std::optional< size_t > NewTableNumRootBits=std::nullopt, std::optional< size_t > NewTableNumSubtrieBits=std::nullopt)
Gets or creates a file at Path with a hash-mapped trie named TrieName.
LLVM_ABI_FOR_TEST size_t size() const
static bool validOffset(FileOffset Offset)
Check the valid range of file offset for OnDiskTrieRawHashMap.
LLVM_ABI_FOR_TEST Expected< OnDiskPtr > insertLazy(ArrayRef< uint8_t > Hash, LazyInsertOnConstructCB OnConstruct=nullptr, LazyInsertOnLeakCB OnLeak=nullptr)
Insert lazily.
LLVM_ABI_FOR_TEST size_t capacity() const
LLVM_ABI_FOR_TEST Expected< ConstOnDiskPtr > recoverFromFileOffset(FileOffset Offset) const
Helper function to recover a pointer into the trie from file offset.
LLVM_ABI_FOR_TEST OnDiskTrieRawHashMap & operator=(OnDiskTrieRawHashMap &&RHS)
void print(raw_ostream &OS, function_ref< void(ArrayRef< char >)> PrintRecordData=nullptr) const
function_ref< void(FileOffset TentativeOffset, ValueProxy TentativeValue, FileOffset FinalOffset, ValueProxy FinalValue)> LazyInsertOnLeakCB
LLVM_ABI_FOR_TEST ~OnDiskTrieRawHashMap()
Interface for logging low-level on-disk cas operations.
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr uintptr_t uintptr_t DataSize
Definition InstrProf.h:267
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
Const value proxy to access the records stored in TrieRawHashMap.
ConstValueProxy(ArrayRef< uint8_t > Hash, StringRef Data)
ConstValueProxy(ArrayRef< uint8_t > Hash, ArrayRef< char > Data)
Value proxy to access the records stored in TrieRawHashMap.
ValueProxy(ArrayRef< uint8_t > Hash, MutableArrayRef< char > Data)