LLVM 23.0.0git
OnDiskGraphDB.h
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This declares OnDiskGraphDB, an ondisk CAS database with a fixed length
11/// hash. This is the class that implements the database storage scheme without
12/// exposing the hashing algorithm.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CAS_ONDISKGRAPHDB_H
17#define LLVM_CAS_ONDISKGRAPHDB_H
18
23#include <atomic>
24
25namespace llvm::cas::ondisk {
26
27/// Standard 8 byte reference inside OnDiskGraphDB.
28class InternalRef {
29public:
30 FileOffset getFileOffset() const { return FileOffset(Data); }
31 uint64_t getRawData() const { return Data; }
32
33 static InternalRef getFromRawData(uint64_t Data) { return InternalRef(Data); }
34 static InternalRef getFromOffset(FileOffset Offset) {
35 return InternalRef(Offset.get());
36 }
37
38 friend bool operator==(InternalRef LHS, InternalRef RHS) {
39 return LHS.Data == RHS.Data;
40 }
41
42private:
44 InternalRef(uint64_t Data) : Data(Data) {}
46};
47
48/// Compact 4 byte reference inside OnDiskGraphDB for smaller references.
49class InternalRef4B {
50public:
51 FileOffset getFileOffset() const { return FileOffset(Data); }
52 uint32_t getRawData() const { return Data; }
53
54 /// Shrink to 4B reference.
55 static std::optional<InternalRef4B> tryToShrink(InternalRef Ref) {
56 uint64_t Offset = Ref.getRawData();
57 if (Offset > UINT32_MAX)
58 return std::nullopt;
59 return InternalRef4B(Offset);
60 }
61
62 operator InternalRef() const {
64 }
65
66private:
67 friend class InternalRef;
68 InternalRef4B(uint32_t Data) : Data(Data) {}
70};
71
72/// Array of internal node references.
74public:
75 size_t size() const { return Size; }
76 bool empty() const { return !Size; }
77
79 : public iterator_facade_base<iterator, std::random_access_iterator_tag,
80 const InternalRef> {
81 public:
82 bool operator==(const iterator &RHS) const { return I == RHS.I; }
85 return *Ref;
87 }
103 if (auto *Ref = dyn_cast<const InternalRef *>(I))
104 I = Ref + N;
105 else
107 return *this;
108 }
110 if (auto *Ref = dyn_cast<const InternalRef *>(I))
111 I = Ref - N;
112 else
114 return *this;
115 }
116 InternalRef operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
117
118 iterator() = default;
119
120 uint64_t getOpaqueData() const { return uintptr_t(I.getOpaqueValue()); }
121
123 return iterator(
125 const InternalRef4B *>::getFromOpaqueValue((void *)
126 Opaque));
127 }
128
129 private:
131 explicit iterator(
133 : I(I) {}
135 };
136
137 bool operator==(const InternalRefArrayRef &RHS) const {
138 return size() == RHS.size() && std::equal(begin(), end(), RHS.begin());
139 }
140
141 iterator begin() const { return iterator(Begin); }
142 iterator end() const { return begin() + Size; }
143
144 /// Array accessor.
145 InternalRef operator[](ptrdiff_t N) const { return begin()[N]; }
146
147 bool is4B() const { return isa<const InternalRef4B *>(Begin); }
148 bool is8B() const { return isa<const InternalRef *>(Begin); }
149
151 if (is4B()) {
152 auto *B = cast<const InternalRef4B *>(Begin);
153 return ArrayRef((const uint8_t *)B, sizeof(InternalRef4B) * Size);
154 }
155 auto *B = cast<const InternalRef *>(Begin);
156 return ArrayRef((const uint8_t *)B, sizeof(InternalRef) * Size);
157 }
158
159 InternalRefArrayRef(std::nullopt_t = std::nullopt) {
160 // This is useful so that all the casts in the \p iterator functions can
161 // operate without needing to check for a null value.
162 static InternalRef PlaceHolder = InternalRef::getFromRawData(0);
163 Begin = &PlaceHolder;
164 }
165
167 : Begin(Refs.begin()), Size(Refs.size()) {}
168
170 : Begin(Refs.begin()), Size(Refs.size()) {}
171
172private:
174 size_t Size = 0;
175};
176
177/// Reference to a node. The node's data may not be stored in the database.
178/// An \p ObjectID instance can only be used with the \p OnDiskGraphDB instance
179/// it came from. \p ObjectIDs from different \p OnDiskGraphDB instances are not
180/// comparable.
181class ObjectID {
182public:
183 uint64_t getOpaqueData() const { return Opaque; }
184
185 static ObjectID fromOpaqueData(uint64_t Opaque) { return ObjectID(Opaque); }
186
187 friend bool operator==(const ObjectID &LHS, const ObjectID &RHS) {
188 return LHS.Opaque == RHS.Opaque;
189 }
190 friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS) {
191 return !(LHS == RHS);
192 }
193
194private:
195 explicit ObjectID(uint64_t Opaque) : Opaque(Opaque) {}
196 uint64_t Opaque;
197};
198
199/// Handle for a loaded node object.
201public:
202 explicit ObjectHandle(uint64_t Opaque) : Opaque(Opaque) {}
203 uint64_t getOpaqueData() const { return Opaque; }
204
206 static ObjectHandle fromMemory(uintptr_t Ptr);
207
208 friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS) {
209 return LHS.Opaque == RHS.Opaque;
210 }
211 friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS) {
212 return !(LHS == RHS);
213 }
214
215private:
216 uint64_t Opaque;
217};
218
219/// Iterator for ObjectID.
221 : public iterator_facade_base<object_refs_iterator,
222 std::random_access_iterator_tag, ObjectID> {
223public:
224 bool operator==(const object_refs_iterator &RHS) const { return I == RHS.I; }
226 return ObjectID::fromOpaqueData((*I).getRawData());
227 }
228 bool operator<(const object_refs_iterator &RHS) const { return I < RHS.I; }
230 return I - RHS.I;
231 }
233 I += N;
234 return *this;
235 }
237 I -= N;
238 return *this;
239 }
240 ObjectID operator[](ptrdiff_t N) const { return *(this->operator+(N)); }
241
244
245 uint64_t getOpaqueData() const { return I.getOpaqueData(); }
246
250
251private:
253};
254
256
257/// On-disk CAS nodes database, independent of a particular hashing algorithm.
258class OnDiskGraphDB {
259public:
260 /// Associate data & references with a particular object ID. If there is
261 /// already a record for this object the operation is a no-op. \param ID the
262 /// object ID to associate the data & references with. \param Refs references
263 /// \param Data data buffer.
266
267 /// \returns \p nullopt if the object associated with \p Ref does not exist.
269
270 /// \returns the hash bytes digest for the object reference.
272 // ObjectID should be valid to fetch Digest.
273 return cantFail(getDigest(getInternalRef(Ref)));
274 }
275
276 /// Form a reference for the provided hash. The reference can be used as part
277 /// of a CAS object even if it's not associated with an object yet.
279
280 /// Get an existing reference to the object \p Digest.
281 ///
282 /// Returns \p nullopt if the object is not stored in this CAS.
283 LLVM_ABI_FOR_TEST std::optional<ObjectID>
284 getExistingReference(ArrayRef<uint8_t> Digest, bool CheckUpstream = true);
285
286 /// Check whether the object associated with \p Ref is stored in the CAS.
287 /// Note that this function will fault-in according to the policy.
289
290 /// Check whether the object associated with \p Ref is stored in the CAS.
291 /// Note that this function does not fault-in.
292 bool containsObject(ObjectID Ref, bool CheckUpstream = true) const {
293 auto Presence = getObjectPresence(Ref, CheckUpstream);
294 if (!Presence) {
295 consumeError(Presence.takeError());
296 return false;
297 }
298 switch (*Presence) {
299 case ObjectPresence::Missing:
300 return false;
301 case ObjectPresence::InPrimaryDB:
302 return true;
303 case ObjectPresence::OnlyInUpstreamDB:
304 return true;
305 }
306 llvm_unreachable("Unknown ObjectPresence enum");
307 }
308
309 /// \returns the data part of the provided object handle.
311
312 /// \returns the object referenced by the provided object handle.
314 InternalRefArrayRef Refs = getInternalRefs(Node);
315 return make_range(Refs.begin(), Refs.end());
316 }
317
318 /// \returns Total size of stored objects.
319 ///
320 /// NOTE: There's a possibility that the returned size is not including a
321 /// large object if the process crashed right at the point of inserting it.
322 LLVM_ABI_FOR_TEST size_t getStorageSize() const;
323
324 /// \returns The precentage of space utilization of hard space limits.
325 ///
326 /// Return value is an integer between 0 and 100 for percentage.
327 unsigned getHardStorageLimitUtilization() const;
328
329 void print(raw_ostream &OS) const;
330
331 /// Hashing function type for validation.
334
335 /// Validate the OnDiskGraphDB.
336 ///
337 /// \param Deep if true, rehash all the objects to ensure no data
338 /// corruption in stored objects, otherwise just validate the structure of
339 /// CAS database.
340 /// \param Hasher is the hashing function used for objects inside CAS.
341 Error validate(bool Deep, HashingFuncT Hasher) const;
342
343 /// Checks that \p ID exists in the index. It is allowed to not have data
344 /// associated with it.
346
347 /// How to fault-in nodes if an upstream database is used.
348 enum class FaultInPolicy {
349 /// Copy only the requested node.
351 /// Copy the the entire graph of a node.
353 };
354
355 /// Open the on-disk store from a directory.
356 ///
357 /// \param Path directory for the on-disk store. The directory will be created
358 /// if it doesn't exist.
359 /// \param HashName Identifier name for the hashing algorithm that is going to
360 /// be used.
361 /// \param HashByteSize Size for the object digest hash bytes.
362 /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
363 /// if they don't exist in the primary store. The upstream store is only used
364 /// for reading nodes, new nodes are only written to the primary store. User
365 /// need to make sure \p UpstreamDB outlives current instance of
366 /// OnDiskGraphDB and the common usage is to have an \p UnifiedOnDiskCache to
367 /// manage both.
368 /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
369 /// to primary store. This is recorded at creation time and subsequent opens
370 /// need to pass the same policy otherwise the \p open will fail.
372 open(StringRef Path, StringRef HashName, unsigned HashByteSize,
373 OnDiskGraphDB *UpstreamDB = nullptr,
374 std::shared_ptr<OnDiskCASLogger> Logger = nullptr,
375 FaultInPolicy Policy = FaultInPolicy::FullTree);
376
378
379private:
380 /// Forward declaration for a proxy for an ondisk index record.
381 struct IndexProxy;
382
383 enum class ObjectPresence {
384 Missing,
385 InPrimaryDB,
386 OnlyInUpstreamDB,
387 };
388
389 /// Check if object exists and if it is on upstream only.
391 getObjectPresence(ObjectID Ref, bool CheckUpstream) const;
392
393 /// When \p load is called for a node that doesn't exist, this function tries
394 /// to load it from the upstream store and copy it to the primary one.
395 Expected<std::optional<ObjectHandle>> faultInFromUpstream(ObjectID PrimaryID);
396
397 /// Import the entire tree from upstream with \p UpstreamNode as root.
398 Error importFullTree(ObjectID PrimaryID, ObjectHandle UpstreamNode);
399 /// Import only the \param UpstreamNode.
400 Error importSingleNode(ObjectID PrimaryID, ObjectHandle UpstreamNode);
401
402 /// Found the IndexProxy for the hash.
404
405 /// Get path for creating standalone data file.
406 void getStandalonePath(StringRef FileSuffix, const IndexProxy &I,
407 SmallVectorImpl<char> &Path) const;
408 /// Create a standalone leaf file.
409 Error createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data);
410
411 /// \name Helper functions for internal data structures.
412 /// \{
413 static InternalRef getInternalRef(ObjectID Ref) {
414 return InternalRef::getFromRawData(Ref.getOpaqueData());
415 }
416
417 static ObjectID getExternalReference(InternalRef Ref) {
418 return ObjectID::fromOpaqueData(Ref.getRawData());
419 }
420
421 static ObjectID getExternalReference(const IndexProxy &I);
422
423 static InternalRef makeInternalRef(FileOffset IndexOffset);
424
425 LLVM_ABI_FOR_TEST Expected<ArrayRef<uint8_t>>
426 getDigest(InternalRef Ref) const;
427
428 ArrayRef<uint8_t> getDigest(const IndexProxy &I) const;
429
430 Expected<IndexProxy> getIndexProxyFromRef(InternalRef Ref) const;
431
432 IndexProxy
433 getIndexProxyFromPointer(OnDiskTrieRawHashMap::ConstOnDiskPtr P) const;
434
435 LLVM_ABI_FOR_TEST InternalRefArrayRef
436 getInternalRefs(ObjectHandle Node) const;
437 /// \}
438
439 /// Get the atomic variable that keeps track of the standalone data storage
440 /// size.
441 std::atomic<uint64_t> &standaloneStorageSize() const;
442
443 /// Increase the standalone data size.
444 void recordStandaloneSizeIncrease(size_t SizeIncrease);
445 /// Get the standalone data size.
446 uint64_t getStandaloneStorageSize() const;
447
448 // Private constructor.
449 OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
450 OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
451 FaultInPolicy Policy, std::shared_ptr<OnDiskCASLogger> Logger);
452
453 /// Mapping from hash to object reference.
454 ///
455 /// Data type is TrieRecord.
456 OnDiskTrieRawHashMap Index;
457
458 /// Storage for most objects.
459 ///
460 /// Data type is DataRecordHandle.
461 OnDiskDataAllocator DataPool;
462
463 /// A StandaloneDataMap.
464 void *StandaloneData = nullptr;
465
466 /// Path to the root directory.
467 std::string RootPath;
468
469 /// Optional on-disk store to be used for faulting-in nodes.
470 OnDiskGraphDB *UpstreamDB = nullptr;
471
472 /// The policy used to fault in data from upstream.
473 FaultInPolicy FIPolicy;
474
475 /// Debug Logger.
476 std::shared_ptr<OnDiskCASLogger> Logger;
477};
478
479} // namespace llvm::cas::ondisk
480
481#endif // LLVM_CAS_ONDISKGRAPHDB_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Mark last scratch load
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_ABI_FOR_TEST
Definition Compiler.h:218
#define I(x, y, z)
Definition MD5.cpp:57
This file declares interface for OnDiskCASLogger, an interface that can be used to log CAS events to ...
This file declares interface for OnDiskDataAllocator, a file backed data pool can be used to allocate...
This file declares interface for OnDiskTrieRawHashMap, a thread-safe and (mostly) lock-free hash map ...
#define P(N)
This file defines the PointerUnion class, which is a discriminated union of pointer types.
Value * RHS
Value * LHS
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
Tagged union holding either a T or a Error.
Definition Error.h:485
Logging utility - given an ordered specification of features, and assuming a scalar reward,...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
FileOffset is a wrapper around uint64_t to represent the offset of data from the beginning of the fil...
Definition FileOffset.h:24
Handle to a loaded object in a ObjectStore instance.
Compact 4 byte reference inside OnDiskGraphDB for smaller references.
static std::optional< InternalRef4B > tryToShrink(InternalRef Ref)
Shrink to 4B reference.
ptrdiff_t operator-(const iterator &RHS) const
bool operator==(const iterator &RHS) const
static iterator fromOpaqueData(uint64_t Opaque)
bool operator<(const iterator &RHS) const
Array of internal node references.
InternalRef operator[](ptrdiff_t N) const
Array accessor.
ArrayRef< uint8_t > getBuffer() const
InternalRefArrayRef(std::nullopt_t=std::nullopt)
InternalRefArrayRef(ArrayRef< InternalRef4B > Refs)
bool operator==(const InternalRefArrayRef &RHS) const
InternalRefArrayRef(ArrayRef< InternalRef > Refs)
Standard 8 byte reference inside OnDiskGraphDB.
friend bool operator==(InternalRef LHS, InternalRef RHS)
FileOffset getFileOffset() const
static InternalRef getFromRawData(uint64_t Data)
static InternalRef getFromOffset(FileOffset Offset)
Handle for a loaded node object.
static ObjectHandle fromFileOffset(FileOffset Offset)
static ObjectHandle fromMemory(uintptr_t Ptr)
friend bool operator!=(const ObjectHandle &LHS, const ObjectHandle &RHS)
friend bool operator==(const ObjectHandle &LHS, const ObjectHandle &RHS)
Reference to a node.
friend bool operator!=(const ObjectID &LHS, const ObjectID &RHS)
uint64_t getOpaqueData() const
friend bool operator==(const ObjectID &LHS, const ObjectID &RHS)
static ObjectID fromOpaqueData(uint64_t Opaque)
On-disk CAS nodes database, independent of a particular hashing algorithm.
FaultInPolicy
How to fault-in nodes if an upstream database is used.
@ FullTree
Copy the the entire graph of a node.
@ SingleNode
Copy only the requested node.
void print(raw_ostream &OS) const
LLVM_ABI_FOR_TEST Error validateObjectID(ObjectID ID) const
Checks that ID exists in the index.
Expected< bool > isMaterialized(ObjectID Ref)
Check whether the object associated with Ref is stored in the CAS.
Error validate(bool Deep, HashingFuncT Hasher) const
Validate the OnDiskGraphDB.
object_refs_range getObjectRefs(ObjectHandle Node) const
unsigned getHardStorageLimitUtilization() const
LLVM_ABI_FOR_TEST Error store(ObjectID ID, ArrayRef< ObjectID > Refs, ArrayRef< char > Data)
Associate data & references with a particular object ID.
ArrayRef< uint8_t > getDigest(ObjectID Ref) const
LLVM_ABI_FOR_TEST size_t getStorageSize() const
static LLVM_ABI_FOR_TEST Expected< std::unique_ptr< OnDiskGraphDB > > open(StringRef Path, StringRef HashName, unsigned HashByteSize, OnDiskGraphDB *UpstreamDB=nullptr, std::shared_ptr< OnDiskCASLogger > Logger=nullptr, FaultInPolicy Policy=FaultInPolicy::FullTree)
Open the on-disk store from a directory.
bool containsObject(ObjectID Ref, bool CheckUpstream=true) const
Check whether the object associated with Ref is stored in the CAS.
LLVM_ABI_FOR_TEST Expected< ObjectID > getReference(ArrayRef< uint8_t > Hash)
Form a reference for the provided hash.
function_ref< void( ArrayRef< ArrayRef< uint8_t > >, ArrayRef< char >, SmallVectorImpl< uint8_t > &)> HashingFuncT
Hashing function type for validation.
LLVM_ABI_FOR_TEST ArrayRef< char > getObjectData(ObjectHandle Node) const
LLVM_ABI_FOR_TEST std::optional< ObjectID > getExistingReference(ArrayRef< uint8_t > Digest, bool CheckUpstream=true)
Get an existing reference to the object Digest.
object_refs_iterator & operator-=(ptrdiff_t N)
bool operator<(const object_refs_iterator &RHS) const
object_refs_iterator & operator+=(ptrdiff_t N)
ptrdiff_t operator-(const object_refs_iterator &RHS) const
bool operator==(const object_refs_iterator &RHS) const
ObjectID operator[](ptrdiff_t N) const
static object_refs_iterator fromOpaqueData(uint64_t Opaque)
object_refs_iterator(InternalRefArrayRef::iterator I)
An efficient, type-erasing, non-owning reference to a callable.
CRTP base class which implements the entire standard iterator facade in terms of a minimal subset of ...
Definition iterator.h:80
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
llvm::iterator_range< object_refs_iterator > object_refs_range
@ Offset
Definition DWP.cpp:532
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Ref
The access may reference the value stored in memory.
Definition ModRef.h:32
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void consumeError(Error Err)
Consume a Error without doing anything.
Definition Error.h:1083
#define N