Line data Source code
1 : //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : ///
10 : /// \file
11 : /// This defines posting list interface: a storage for identifiers of symbols
12 : /// which can be characterized by a specific feature (such as fuzzy-find
13 : /// trigram, scope, type or any other Search Token). Posting lists can be
14 : /// traversed in order using an iterator and are values for inverted index,
15 : /// which maps search tokens to corresponding posting lists.
16 : ///
17 : /// In order to decrease size of Index in-memory representation, Variable Byte
18 : /// Encoding (VByte) is used for PostingLists compression. An overview of VByte
19 : /// algorithm can be found in "Introduction to Information Retrieval" book:
20 : /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
21 : ///
22 : //===----------------------------------------------------------------------===//
23 :
24 : #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
25 : #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
26 :
27 : #include "Iterator.h"
28 : #include "llvm/ADT/ArrayRef.h"
29 : #include "llvm/ADT/SmallVector.h"
30 : #include <cstdint>
31 : #include <vector>
32 :
33 : namespace clang {
34 : namespace clangd {
35 : namespace dex {
36 : struct Token;
37 :
38 : /// NOTE: This is an implementation detail.
39 : ///
40 : /// Chunk is a fixed-width piece of PostingList which contains the first DocID
41 : /// in uncompressed format (Head) and delta-encoded Payload. It can be
42 : /// decompressed upon request.
43 : struct Chunk {
44 : /// Keep sizeof(Chunk) == 32.
45 : static constexpr size_t PayloadSize = 32 - sizeof(DocID);
46 :
47 : llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
48 :
49 : /// The first element of decompressed Chunk.
50 : DocID Head;
51 : /// VByte-encoded deltas.
52 : std::array<uint8_t, PayloadSize> Payload;
53 : };
54 : static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
55 :
56 : /// PostingList is the storage of DocIDs which can be inserted to the Query
57 : /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
58 : /// are stored in underlying chunks. Compression saves memory at a small cost
59 : /// in access time, which is still fast enough in practice.
60 0 : class PostingList {
61 : public:
62 : explicit PostingList(llvm::ArrayRef<DocID> Documents);
63 :
64 : /// Constructs DocumentIterator over given posting list. DocumentIterator will
65 : /// go through the chunks and decompress them on-the-fly when necessary.
66 : /// If given, Tok is only used for the string representation.
67 : std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const;
68 :
69 : /// Returns in-memory size of external storage.
70 : size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
71 :
72 : private:
73 : const std::vector<Chunk> Chunks;
74 : };
75 :
76 : } // namespace dex
77 : } // namespace clangd
78 : } // namespace clang
79 :
80 : #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
|