LLVM 22.0.0git
OnDiskGraphDB.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This declares OnDiskGraphDB, an ondisk CAS database with a fixed length
11/// hash. This is the class that implements the database storage scheme without
12/// exposing the hashing algorithm.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKGRAPHDB_H
17#define LLVM_CAS_ONDISKGRAPHDB_H
18
22#include <atomic>
23
25
26/// Standard 8 byte reference inside OnDiskGraphDB.
27class InternalRef {
28public:
29 FileOffset getFileOffset() const { return FileOffset(Data); }
30 uint64_t getRawData() const { return Data; }
31
32 static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
33 static InternalRef getFromOffset(FileOffset Offset) {
34 return InternalRef(Offset.get());
35 }
36
37 friend bool operator==(InternalRef LHS, InternalRef RHS) {
38 return LHS.Data == RHS.Data;
39 }
40
41private:
43 InternalRef(uint64_t Data) : Data(Data) {}
45};
46
47/// Compact 4 byte reference inside OnDiskGraphDB for smaller references.
48class InternalRef4B {
49public:
50 FileOffset getFileOffset() const { return FileOffset(Data); }
51 uint32_t getRawData() const { return Data; }
52
53 /// Shrink to 4B reference.
54 static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
55 uint64_t Offset = Ref.getRawData();
56 if (Offset > UINT32_MAX)
57 return std::nullopt;
58 return InternalRef4B(Offset);
59 }
60
61 operator InternalRef() const {
63 }
64
65private:
66 friend class InternalRef;
67 InternalRef4B(uint32_t Data) : Data(Data) {}
69};
70
71/// Array of internal node references.
73public:
74 size_t size() const { return Size; }
75 bool empty() const { return !Size; }
76
78 : public iterator_facade_base<iterator, std::random_access_iterator_tag,
79 const InternalRef> {
80 public:
81 bool operator==(const iterator &RHS) const { return I == RHS.I; }
84 return *Ref;
86 }
102 if (auto *Ref = dyn_cast<const InternalRef *>(I))
103 I = Ref + N;
104 else
106 return *this;
107 }
109 if (auto *Ref = dyn_cast<const InternalRef *>(I))
110 I = Ref - N;
111 else
113 return *this;
114 }
115 InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
116
117 iterator() = default;
118
119 uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
120
122 return iterator(
124 const InternalRef4B *>::getFromOpaqueValue((void *)
125 Opaque));
126 }
127
128 private:
130 explicit iterator(
132 : I(I) {}
134 };
135
136 bool operator==(const InternalRefArrayRef &RHS) const {
137 return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
138 }
139
140 iterator begin() const { return iterator(Begin); }
141 iterator end() const { return begin() + Size; }
142
143 /// Array accessor.
144 InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
145
146 bool is4B() const { return isa<const InternalRef4B *>(Begin); }
147 bool is8B() const { return isa<const InternalRef *>(Begin); }
148
150 if (is4B()) {
151 auto *B = cast<const InternalRef4B *>(Begin);
152 return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
153 }
154 auto *B = cast<const InternalRef *>(Begin);
155 return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
156 }
157
158 InternalRefArrayRef(std::nullopt_t = std::nullopt) {
159 // This is useful so that all the casts in the \p iterator functions can
160 // operate without needing to check for a null value.
161 static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
162 Begin = &PlaceHolder;
163 }
164
166 : Begin(Refs.begin()), Size(Refs.size()) {}
167
169 : Begin(Refs.begin()), Size(Refs.size()) {}
170
171private:
173 size_t Size = 0;
174};
175
176/// Reference to a node. The node's data may not be stored in the database.
177/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
178/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
179/// comparable.
180class ObjectID {
181public:
182 uint64_t getOpaqueData() const { return Opaque; }
183
184 static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
185
186 friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
187 return LHS.Opaque == RHS.Opaque;
188 }
189 friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
190 return !(LHS == RHS);
191 }
192
193private:
194 explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
195 uint64_t Opaque;
196};
197
198/// Handle for a loaded node object.
200public:
201 explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
202 uint64_t getOpaqueData() const { return Opaque; }
203
205 static ObjectHandle fromMemory(uintptr_t Ptr);
206
207 friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
208 return LHS.Opaque == RHS.Opaque;
209 }
210 friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
211 return !(LHS == RHS);
212 }
213
214private:
215 uint64_t Opaque;
216};
217
218/// Iterator for ObjectID.
220 : public iterator_facade_base<object_refs_iterator,
221 std::random_access_iterator_tag, ObjectID> {
222public:
223 bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
225 return ObjectID::fromOpaqueData((*I).getRawData());
226 }
227 bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
229 return I - RHS.I;
230 }
232 I += N;
233 return *this;
234 }
236 I -= N;
237 return *this;
238 }
239 ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
240
243
244 uint64_t getOpaqueData() const { return I.getOpaqueData(); }
245
249
250private:
252};
253
255
256/// On-disk CAS nodes database, independent of a particular hashing algorithm.
257class OnDiskGraphDB {
258public:
259 /// Associate data & references with a particular object ID. If there is
260 /// already a record for this object the operation is a no-op. \param ID the
261 /// object ID to associate the data & references with. \param Refs references
262 /// \param Data data buffer.
265
266 /// \returns \p nullopt if the object associated with \p Ref does not exist.
268
269 /// \returns the hash bytes digest for the object reference.
271 // ObjectID should be valid to fetch Digest.
272 return cantFail(getDigest(getInternalRef(Ref)));
273 }
274
275 /// Form a reference for the provided hash. The reference can be used as part
276 /// of a CAS object even if it's not associated with an object yet.
278
279 /// Get an existing reference to the object \p Digest.
280 ///
281 /// Returns \p nullopt if the object is not stored in this CAS.
282 LLVM_ABI_FOR_TEST std::optional<ObjectID>
283 getExistingReference(ArrayRef<uint8_t> Digest, bool CheckUpstream = true);
284
285 /// Check whether the object associated with \p Ref is stored in the CAS.
286 /// Note that this function will fault-in according to the policy.
288
289 /// Check whether the object associated with \p Ref is stored in the CAS.
290 /// Note that this function does not fault-in.
291 bool containsObject(ObjectID Ref, bool CheckUpstream = true) const {
292 auto Presence = getObjectPresence(Ref, CheckUpstream);
293 if (!Presence) {
294 consumeError(Presence.takeError());
295 return false;
296 }
297 switch (*Presence) {
298 case ObjectPresence::Missing:
299 return false;
300 case ObjectPresence::InPrimaryDB:
301 return true;
302 case ObjectPresence::OnlyInUpstreamDB:
303 return true;
304 }
305 llvm_unreachable("Unknown ObjectPresence enum");
306 }
307
308 /// \returns the data part of the provided object handle.
310
311 /// \returns the object referenced by the provided object handle.
313 InternalRefArrayRef Refs = getInternalRefs(Node);
314 return make_range(Refs.begin(), Refs.end());
315 }
316
317 /// \returns Total size of stored objects.
318 ///
319 /// NOTE: There's a possibility that the returned size is not including a
320 /// large object if the process crashed right at the point of inserting it.
321 LLVM_ABI_FOR_TEST size_t getStorageSize() const;
322
323 /// \returns The precentage of space utilization of hard space limits.
324 ///
325 /// Return value is an integer between 0 and 100 for percentage.
326 unsigned getHardStorageLimitUtilization() const;
327
328 void print(raw_ostream &OS) const;
329
330 /// Hashing function type for validation.
333
334 /// Validate the OnDiskGraphDB.
335 ///
336 /// \param Deep if true, rehash all the objects to ensure no data
337 /// corruption in stored objects, otherwise just validate the structure of
338 /// CAS database.
339 /// \param Hasher is the hashing function used for objects inside CAS.
340 Error validate(bool Deep, HashingFuncT Hasher) const;
341
342 /// How to fault-in nodes if an upstream database is used.
343 enum class FaultInPolicy {
344 /// Copy only the requested node.
346 /// Copy the the entire graph of a node.
348 };
349
350 /// Open the on-disk store from a directory.
351 ///
352 /// \param Path directory for the on-disk store. The directory will be created
353 /// if it doesn't exist.
354 /// \param HashName Identifier name for the hashing algorithm that is going to
355 /// be used.
356 /// \param HashByteSize Size for the object digest hash bytes.
357 /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
358 /// if they don't exist in the primary store. The upstream store is only used
359 /// for reading nodes, new nodes are only written to the primary store. User
360 /// need to make sure \p UpstreamDB outlives current instance of
361 /// OnDiskGraphDB and the common usage is to have an \p UnifiedOnDiskCache to
362 /// manage both.
363 /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
364 /// to primary store. This is recorded at creation time and subsequent opens
365 /// need to pass the same policy otherwise the \p open will fail.
367 open(StringRef Path, StringRef HashName, unsigned HashByteSize,
368 OnDiskGraphDB *UpstreamDB = nullptr,
369 FaultInPolicy Policy = FaultInPolicy::FullTree);
370
372
373private:
374 /// Forward declaration for a proxy for an ondisk index record.
375 struct IndexProxy;
376
377 enum class ObjectPresence {
378 Missing,
379 InPrimaryDB,
380 OnlyInUpstreamDB,
381 };
382
383 /// Check if object exists and if it is on upstream only.
385 getObjectPresence(ObjectID Ref, bool CheckUpstream) const;
386
387 /// When \p load is called for a node that doesn't exist, this function tries
388 /// to load it from the upstream store and copy it to the primary one.
389 Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
390
391 /// Import the entire tree from upstream with \p UpstreamNode as root.
392 Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
393 /// Import only the \param UpstreamNode.
394 Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
395
396 /// Found the IndexProxy for the hash.
398
399 /// Get path for creating standalone data file.
400 void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
401 SmallVectorImpl<char> &Path) const;
402 /// Create a standalone leaf file.
403 Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
404
405 /// \name Helper functions for internal data structures.
406 /// \{
407 static InternalRef getInternalRef(ObjectID Ref) {
408 return InternalRef::getFromRawData(Ref.getOpaqueData());
409 }
410
411 static ObjectID getExternalReference(InternalRef Ref) {
412 return ObjectID::fromOpaqueData(Ref.getRawData());
413 }
414
415 static ObjectID getExternalReference(const IndexProxy &I);
416
417 static InternalRef makeInternalRef(FileOffset IndexOffset);
418
419 LLVM_ABI_FOR_TEST Expected<ArrayRef<uint8_t>>
420 getDigest(InternalRef Ref) const;
421
422 ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
423
424 Expected<IndexProxy> getIndexProxyFromRef(InternalRef Ref) const;
425
426 IndexProxy
427 getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const;
428
429 LLVM_ABI_FOR_TEST InternalRefArrayRef
430 getInternalRefs(ObjectHandle Node) const;
431 /// \}
432
433 /// Get the atomic variable that keeps track of the standalone data storage
434 /// size.
435 std::atomic<uint64_t> &standaloneStorageSize() const;
436
437 /// Increase the standalone data size.
438 void recordStandaloneSizeIncrease(size_t SizeIncrease);
439 /// Get the standalone data size.
440 uint64_t getStandaloneStorageSize() const;
441
442 // Private constructor.
443 OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
444 OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
445 FaultInPolicy Policy);
446
447 /// Mapping from hash to object reference.
448 ///
449 /// Data type is TrieRecord.
450 OnDiskTrieRawHashMap Index;
451
452 /// Storage for most objects.
453 ///
454 /// Data type is DataRecordHandle.
455 OnDiskDataAllocator DataPool;
456
457 /// A StandaloneDataMap.
458 void *StandaloneData = nullptr;
459
460 /// Path to the root directory.
461 std::string RootPath;
462
463 /// Optional on-disk store to be used for faulting-in nodes.
464 OnDiskGraphDB *UpstreamDB = nullptr;
465
466 /// The policy used to fault in data from upstream.
467 FaultInPolicy FIPolicy;
468};
469
470} // namespace llvm::cas::ondisk
471
472#endif // LLVM_CAS_ONDISKGRAPHDB_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ABI_FOR_TEST
Definition Compiler.h:218
#define I(x, y, z)
Definition MD5.cpp:57
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
This file defines the PointerUnion class, which is a discriminated union of pointer types.
Value * RHS
Value * LHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
Handle to a loaded object in a ObjectStore instance.
Compact 4 byte reference inside OnDiskGraphDB for smaller references.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
ptrdiff_t operator-(const iterator &RHS) const
bool operator==(const iterator &RHS) const
static iterator fromOpaqueData(uint64_t Opaque)
bool operator<(const iterator &RHS) const
Array of internal node references.
InternalRef operator[](ptrdiff_t N) const
Array accessor.
ArrayRef< uint8_t > getBuffer() const
InternalRefArrayRef(std::nullopt_t=std::nullopt)
InternalRefArrayRef(ArrayRef< InternalRef4B > Refs)
bool operator==(const InternalRefArrayRef &RHS) const
InternalRefArrayRef(ArrayRef< InternalRef > Refs)
Standard 8 byte reference inside OnDiskGraphDB.
friend bool operator==(InternalRef LHS, InternalRef RHS)
FileOffset getFileOffset() const
static InternalRef getFromRawData(uint64_t Data)
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS)
friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS)
Reference to a node.
friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS)
uint64_t getOpaqueData() const
friend bool operator==(const ObjectID &LHS, const ObjectID &RHS)
static ObjectID fromOpaqueData(uint64_t Opaque)
On-disk CAS nodes database, independent of a particular hashing algorithm.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ FullTree
Copy the the entire graph of a node.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
static LLVM_ABI_FOR_TEST Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
LLVM_ABI_FOR_TEST size_t getStorageSize() const
bool containsObject(ObjectID Ref, bool CheckUpstream=true) const
Check whether the object associated with Ref is stored in the CAS.
LLVM_ABI_FOR_TEST Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
LLVM_ABI_FOR_TEST ArrayRef< char > getObjectData(ObjectHandle Node) const
LLVM_ABI_FOR_TEST std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest, bool CheckUpstream=true)
Get an existing reference to the object Digest.
object_refs_iterator & operator-=(ptrdiff_t N)
bool operator<(const object_refs_iterator &RHS) const
object_refs_iterator & operator+=(ptrdiff_t N)
ptrdiff_t operator-(const object_refs_iterator &RHS) const
bool operator==(const object_refs_iterator &RHS) const
ObjectID operator[](ptrdiff_t N) const
static object_refs_iterator fromOpaqueData(uint64_t Opaque)
object_refs_iterator(InternalRefArrayRef::iterator I)
An efficient, type-erasing, non-owning reference to a callable.
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
llvm::iterator_range< object_refs_iterator > object_refs_range
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1083
#define N