LLVM 22.0.0git
OnDiskGraphDB.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This declares OnDiskGraphDB, an ondisk CAS database with a fixed length
11/// hash. This is the class that implements the database storage scheme without
12/// exposing the hashing algorithm.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKGRAPHDB_H
17#define LLVM_CAS_ONDISKGRAPHDB_H
18
22
24
25/// Standard 8 byte reference inside OnDiskGraphDB.
26class InternalRef {
27public:
28 FileOffset getFileOffset() const { return FileOffset(Data); }
29 uint64_t getRawData() const { return Data; }
30
31 static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
32 static InternalRef getFromOffset(FileOffset Offset) {
33 return InternalRef(Offset.get());
34 }
35
36 friend bool operator==(InternalRef LHS, InternalRef RHS) {
37 return LHS.Data == RHS.Data;
38 }
39
40private:
42 InternalRef(uint64_t Data) : Data(Data) {}
44};
45
46/// Compact 4 byte reference inside OnDiskGraphDB for smaller references.
47class InternalRef4B {
48public:
49 FileOffset getFileOffset() const { return FileOffset(Data); }
50 uint32_t getRawData() const { return Data; }
51
52 /// Shrink to 4B reference.
53 static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
54 uint64_t Offset = Ref.getRawData();
55 if (Offset > UINT32_MAX)
56 return std::nullopt;
57 return InternalRef4B(Offset);
58 }
59
60 operator InternalRef() const {
62 }
63
64private:
65 friend class InternalRef;
66 InternalRef4B(uint32_t Data) : Data(Data) {}
68};
69
70/// Array of internal node references.
72public:
73 size_t size() const { return Size; }
74 bool empty() const { return !Size; }
75
77 : public iterator_facade_base<iterator, std::random_access_iterator_tag,
78 const InternalRef> {
79 public:
80 bool operator==(const iterator &RHS) const { return I == RHS.I; }
83 return *Ref;
85 }
101 if (auto *Ref = dyn_cast<const InternalRef *>(I))
102 I = Ref + N;
103 else
105 return *this;
106 }
108 if (auto *Ref = dyn_cast<const InternalRef *>(I))
109 I = Ref - N;
110 else
112 return *this;
113 }
114 InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
115
116 iterator() = default;
117
118 uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
119
121 return iterator(
123 const InternalRef4B *>::getFromOpaqueValue((void *)
124 Opaque));
125 }
126
127 private:
129 explicit iterator(
131 : I(I) {}
133 };
134
135 bool operator==(const InternalRefArrayRef &RHS) const {
136 return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
137 }
138
139 iterator begin() const { return iterator(Begin); }
140 iterator end() const { return begin() + Size; }
141
142 /// Array accessor.
143 InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
144
145 bool is4B() const { return isa<const InternalRef4B *>(Begin); }
146 bool is8B() const { return isa<const InternalRef *>(Begin); }
147
149 if (is4B()) {
150 auto *B = cast<const InternalRef4B *>(Begin);
151 return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
152 }
153 auto *B = cast<const InternalRef *>(Begin);
154 return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
155 }
156
157 InternalRefArrayRef(std::nullopt_t = std::nullopt) {
158 // This is useful so that all the casts in the \p iterator functions can
159 // operate without needing to check for a null value.
160 static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
161 Begin = &PlaceHolder;
162 }
163
165 : Begin(Refs.begin()), Size(Refs.size()) {}
166
168 : Begin(Refs.begin()), Size(Refs.size()) {}
169
170private:
172 size_t Size = 0;
173};
174
175/// Reference to a node. The node's data may not be stored in the database.
176/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
177/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
178/// comparable.
179class ObjectID {
180public:
181 uint64_t getOpaqueData() const { return Opaque; }
182
183 static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
184
185 friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
186 return LHS.Opaque == RHS.Opaque;
187 }
188 friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
189 return !(LHS == RHS);
190 }
191
192private:
193 explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
194 uint64_t Opaque;
195};
196
197/// Handle for a loaded node object.
199public:
200 explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
201 uint64_t getOpaqueData() const { return Opaque; }
202
204 static ObjectHandle fromMemory(uintptr_t Ptr);
205
206 friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
207 return LHS.Opaque == RHS.Opaque;
208 }
209 friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
210 return !(LHS == RHS);
211 }
212
213private:
214 uint64_t Opaque;
215};
216
217/// Iterator for ObjectID.
219 : public iterator_facade_base<object_refs_iterator,
220 std::random_access_iterator_tag, ObjectID> {
221public:
222 bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
224 return ObjectID::fromOpaqueData((*I).getRawData());
225 }
226 bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
228 return I - RHS.I;
229 }
231 I += N;
232 return *this;
233 }
235 I -= N;
236 return *this;
237 }
238 ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
239
242
243 uint64_t getOpaqueData() const { return I.getOpaqueData(); }
244
248
249private:
251};
252
254
255/// On-disk CAS nodes database, independent of a particular hashing algorithm.
256class OnDiskGraphDB {
257public:
258 /// Associate data & references with a particular object ID. If there is
259 /// already a record for this object the operation is a no-op. \param ID the
260 /// object ID to associate the data & references with. \param Refs references
261 /// \param Data data buffer.
264
265 /// \returns \p nullopt if the object associated with \p Ref does not exist.
267
268 /// \returns the hash bytes digest for the object reference.
270 // ObjectID should be valid to fetch Digest.
271 return cantFail(getDigest(getInternalRef(Ref)));
272 }
273
274 /// Form a reference for the provided hash. The reference can be used as part
275 /// of a CAS object even if it's not associated with an object yet.
277
278 /// Get an existing reference to the object \p Digest.
279 ///
280 /// Returns \p nullopt if the object is not stored in this CAS.
281 LLVM_ABI_FOR_TEST std::optional<ObjectID>
282 getExistingReference(ArrayRef<uint8_t> Digest, bool CheckUpstream = true);
283
284 /// Check whether the object associated with \p Ref is stored in the CAS.
285 /// Note that this function will fault-in according to the policy.
287
288 /// Check whether the object associated with \p Ref is stored in the CAS.
289 /// Note that this function does not fault-in.
290 bool containsObject(ObjectID Ref, bool CheckUpstream = true) const {
291 auto Presence = getObjectPresence(Ref, CheckUpstream);
292 if (!Presence) {
293 consumeError(Presence.takeError());
294 return false;
295 }
296 switch (*Presence) {
297 case ObjectPresence::Missing:
298 return false;
299 case ObjectPresence::InPrimaryDB:
300 return true;
301 case ObjectPresence::OnlyInUpstreamDB:
302 return true;
303 }
304 llvm_unreachable("Unknown ObjectPresence enum");
305 }
306
307 /// \returns the data part of the provided object handle.
309
310 /// \returns the object referenced by the provided object handle.
312 InternalRefArrayRef Refs = getInternalRefs(Node);
313 return make_range(Refs.begin(), Refs.end());
314 }
315
316 /// \returns Total size of stored objects.
317 ///
318 /// NOTE: There's a possibility that the returned size is not including a
319 /// large object if the process crashed right at the point of inserting it.
320 LLVM_ABI_FOR_TEST size_t getStorageSize() const;
321
322 /// \returns The precentage of space utilization of hard space limits.
323 ///
324 /// Return value is an integer between 0 and 100 for percentage.
325 unsigned getHardStorageLimitUtilization() const;
326
327 void print(raw_ostream &OS) const;
328
329 /// Hashing function type for validation.
332
333 /// Validate the OnDiskGraphDB.
334 ///
335 /// \param Deep if true, rehash all the objects to ensure no data
336 /// corruption in stored objects, otherwise just validate the structure of
337 /// CAS database.
338 /// \param Hasher is the hashing function used for objects inside CAS.
339 Error validate(bool Deep, HashingFuncT Hasher) const;
340
341 /// How to fault-in nodes if an upstream database is used.
342 enum class FaultInPolicy {
343 /// Copy only the requested node.
345 /// Copy the the entire graph of a node.
347 };
348
349 /// Open the on-disk store from a directory.
350 ///
351 /// \param Path directory for the on-disk store. The directory will be created
352 /// if it doesn't exist.
353 /// \param HashName Identifier name for the hashing algorithm that is going to
354 /// be used.
355 /// \param HashByteSize Size for the object digest hash bytes.
356 /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
357 /// if they don't exist in the primary store. The upstream store is only used
358 /// for reading nodes, new nodes are only written to the primary store. User
359 /// need to make sure \p UpstreamDB outlives current instance of
360 /// OnDiskGraphDB and the common usage is to have an \p UnifiedOnDiskCache to
361 /// manage both.
362 /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
363 /// to primary store. This is recorded at creation time and subsequent opens
364 /// need to pass the same policy otherwise the \p open will fail.
366 open(StringRef Path, StringRef HashName, unsigned HashByteSize,
367 OnDiskGraphDB *UpstreamDB = nullptr,
368 FaultInPolicy Policy = FaultInPolicy::FullTree);
369
371
372private:
373 /// Forward declaration for a proxy for an ondisk index record.
374 struct IndexProxy;
375
376 enum class ObjectPresence {
377 Missing,
378 InPrimaryDB,
379 OnlyInUpstreamDB,
380 };
381
382 /// Check if object exists and if it is on upstream only.
384 getObjectPresence(ObjectID Ref, bool CheckUpstream) const;
385
386 /// When \p load is called for a node that doesn't exist, this function tries
387 /// to load it from the upstream store and copy it to the primary one.
388 Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
389
390 /// Import the entire tree from upstream with \p UpstreamNode as root.
391 Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
392 /// Import only the \param UpstreamNode.
393 Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
394
395 /// Found the IndexProxy for the hash.
397
398 /// Get path for creating standalone data file.
399 void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
400 SmallVectorImpl<char> &Path) const;
401 /// Create a standalone leaf file.
402 Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
403
404 /// \name Helper functions for internal data structures.
405 /// \{
406 static InternalRef getInternalRef(ObjectID Ref) {
407 return InternalRef::getFromRawData(Ref.getOpaqueData());
408 }
409
410 static ObjectID getExternalReference(InternalRef Ref) {
411 return ObjectID::fromOpaqueData(Ref.getRawData());
412 }
413
414 static ObjectID getExternalReference(const IndexProxy &I);
415
416 static InternalRef makeInternalRef(FileOffset IndexOffset);
417
418 LLVM_ABI_FOR_TEST Expected<ArrayRef<uint8_t>>
419 getDigest(InternalRef Ref) const;
420
421 ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
422
423 Expected<IndexProxy> getIndexProxyFromRef(InternalRef Ref) const;
424
425 IndexProxy
426 getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const;
427
428 LLVM_ABI_FOR_TEST InternalRefArrayRef
429 getInternalRefs(ObjectHandle Node) const;
430 /// \}
431
432 /// Get the atomic variable that keeps track of the standalone data storage
433 /// size.
434 std::atomic<uint64_t> &standaloneStorageSize() const;
435
436 /// Increase the standalone data size.
437 void recordStandaloneSizeIncrease(size_t SizeIncrease);
438 /// Get the standalone data size.
439 uint64_t getStandaloneStorageSize() const;
440
441 // Private constructor.
442 OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
443 OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
444 FaultInPolicy Policy);
445
446 /// Mapping from hash to object reference.
447 ///
448 /// Data type is TrieRecord.
449 OnDiskTrieRawHashMap Index;
450
451 /// Storage for most objects.
452 ///
453 /// Data type is DataRecordHandle.
454 OnDiskDataAllocator DataPool;
455
456 /// A StandaloneDataMap.
457 void *StandaloneData = nullptr;
458
459 /// Path to the root directory.
460 std::string RootPath;
461
462 /// Optional on-disk store to be used for faulting-in nodes.
463 OnDiskGraphDB *UpstreamDB = nullptr;
464
465 /// The policy used to fault in data from upstream.
466 FaultInPolicy FIPolicy;
467};
468
469} // namespace llvm::cas::ondisk
470
471#endif // LLVM_CAS_ONDISKGRAPHDB_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ABI_FOR_TEST
Definition Compiler.h:218
#define I(x, y, z)
Definition MD5.cpp:57
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
This file defines the PointerUnion class, which is a discriminated union of pointer types.
Value * RHS
Value * LHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
Handle to a loaded object in a ObjectStore instance.
Compact 4 byte reference inside OnDiskGraphDB for smaller references.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
ptrdiff_t operator-(const iterator &RHS) const
bool operator==(const iterator &RHS) const
static iterator fromOpaqueData(uint64_t Opaque)
bool operator<(const iterator &RHS) const
Array of internal node references.
InternalRef operator[](ptrdiff_t N) const
Array accessor.
ArrayRef< uint8_t > getBuffer() const
InternalRefArrayRef(std::nullopt_t=std::nullopt)
InternalRefArrayRef(ArrayRef< InternalRef4B > Refs)
bool operator==(const InternalRefArrayRef &RHS) const
InternalRefArrayRef(ArrayRef< InternalRef > Refs)
Standard 8 byte reference inside OnDiskGraphDB.
friend bool operator==(InternalRef LHS, InternalRef RHS)
FileOffset getFileOffset() const
static InternalRef getFromRawData(uint64_t Data)
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS)
friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS)
Reference to a node.
friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS)
uint64_t getOpaqueData() const
friend bool operator==(const ObjectID &LHS, const ObjectID &RHS)
static ObjectID fromOpaqueData(uint64_t Opaque)
On-disk CAS nodes database, independent of a particular hashing algorithm.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ FullTree
Copy the the entire graph of a node.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
static LLVM_ABI_FOR_TEST Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
LLVM_ABI_FOR_TEST size_t getStorageSize() const
bool containsObject(ObjectID Ref, bool CheckUpstream=true) const
Check whether the object associated with Ref is stored in the CAS.
LLVM_ABI_FOR_TEST Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
LLVM_ABI_FOR_TEST ArrayRef< char > getObjectData(ObjectHandle Node) const
LLVM_ABI_FOR_TEST std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest, bool CheckUpstream=true)
Get an existing reference to the object Digest.
object_refs_iterator & operator-=(ptrdiff_t N)
bool operator<(const object_refs_iterator &RHS) const
object_refs_iterator & operator+=(ptrdiff_t N)
ptrdiff_t operator-(const object_refs_iterator &RHS) const
bool operator==(const object_refs_iterator &RHS) const
ObjectID operator[](ptrdiff_t N) const
static object_refs_iterator fromOpaqueData(uint64_t Opaque)
object_refs_iterator(InternalRefArrayRef::iterator I)
An efficient, type-erasing, non-owning reference to a callable.
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
llvm::iterator_range< object_refs_iterator > object_refs_range
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1083
#define N