Line data Source code
1 : //===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : ///
10 : /// \file
11 : /// Symbol index queries consist of specific requirements for the requested
12 : /// symbol, such as high fuzzy matching score, scope, type etc. The lists of all
13 : /// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope)
14 : /// are expressed in a form of Search Tokens which are stored in the inverted
15 : /// index. Inverted index maps these tokens to the posting lists - sorted (by
16 : /// symbol quality) sequences of symbol IDs matching the token, e.g. scope token
17 : /// "clangd::clangd::" is mapped to the list of IDs of all symbols which are
18 : /// declared in this namespace. Search queries are build from a set of
19 : /// requirements which can be combined with each other forming the query trees.
20 : /// The leafs of such trees are posting lists, and the nodes are operations on
21 : /// these posting lists, e.g. intersection or union. Efficient processing of
22 : /// these multi-level queries is handled by Iterators. Iterators advance through
23 : /// all leaf posting lists producing the result of search query, which preserves
24 : /// the sorted order of IDs. Having the resulting IDs sorted is important,
25 : /// because it allows receiving a certain number of the most valuable items
26 : /// (e.g. symbols with highest quality which was the sorting key in the first
27 : /// place) without processing all items with requested properties (this might
28 : /// not be computationally effective if search request is not very restrictive).
29 : //
30 : //===----------------------------------------------------------------------===//
31 :
32 : #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
33 : #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
34 :
35 : #include "llvm/ADT/ArrayRef.h"
36 : #include "llvm/Support/raw_ostream.h"
37 : #include <algorithm>
38 : #include <memory>
39 : #include <vector>
40 :
41 : namespace clang {
42 : namespace clangd {
43 : namespace dex {
44 :
45 : /// Symbol position in the list of all index symbols sorted by a pre-computed
46 : /// symbol quality.
47 : using DocID = uint32_t;
48 :
49 : /// Iterator is the interface for Query Tree node. The simplest type of Iterator
50 : /// is DocumentIterator which is simply a wrapper around PostingList iterator
51 : /// and serves as the Query Tree leaf. More sophisticated examples of iterators
52 : /// can manage intersection, union of the elements produced by other iterators
53 : /// (their children) to form a multi-level Query Tree. The interface is designed
54 : /// to be extensible in order to support multiple types of iterators.
55 : class Iterator {
56 : public:
57 : /// Returns true if all valid DocIDs were processed and hence the iterator is
58 : /// exhausted.
59 : virtual bool reachedEnd() const = 0;
60 : /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted
61 : /// and proceeds to the END.
62 : ///
63 : /// Note: reachedEnd() must be false.
64 : virtual void advance() = 0;
65 : /// Moves to the first valid DocID which is equal or higher than given ID. If
66 : /// it doesn't exist, the iterator is exhausted and proceeds to the END.
67 : ///
68 : /// Note: reachedEnd() must be false.
69 : virtual void advanceTo(DocID ID) = 0;
70 : /// Returns the current element this iterator points to.
71 : ///
72 : /// Note: reachedEnd() must be false.
73 : virtual DocID peek() const = 0;
74 : /// Informs the iterator that the current document was consumed, and returns
75 : /// its boost.
76 : ///
77 : /// Note: If this iterator has any child iterators that contain the document,
78 : /// consume() should be called on those and their boosts incorporated.
79 : /// consume() must *not* be called on children that don't contain the current
80 : /// doc.
81 : virtual float consume() = 0;
82 : /// Returns an estimate of advance() calls before the iterator is exhausted.
83 : virtual size_t estimateSize() const = 0;
84 :
85 : virtual ~Iterator() {}
86 :
87 : /// Prints a convenient human-readable iterator representation by recursively
88 : /// dumping iterators in the following format:
89 : ///
90 : /// (Type Child1 Child2 ...)
91 : ///
92 : /// Where Type is the iterator type representation: "&" for And, "|" for Or,
93 : /// ChildN is N-th iterator child. Raw iterators over PostingList are
94 : /// represented as "[... CurID ...]" where CurID is the current PostingList
95 : /// entry being inspected.
96 : friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
97 : const Iterator &Iterator) {
98 0 : return Iterator.dump(OS);
99 : }
100 :
101 : /// Inspect iterator type, used internally for optimizing query trees.
102 : enum class Kind { And, Or, True, False, Other };
103 : Kind kind() const { return MyKind; }
104 :
105 : protected:
106 : Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
107 :
108 : private:
109 : virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
110 : Kind MyKind;
111 : };
112 :
113 : /// Advances the iterator until it is exhausted. Returns pairs of document IDs
114 : /// with the corresponding boosting score.
115 : ///
116 : /// Boosting can be seen as a compromise between retrieving too many items and
117 : /// calculating finals score for each of them (which might be very expensive)
118 : /// and not retrieving enough items so that items with very high final score
119 : /// would not be processed. Boosting score is a computationally efficient way
120 : /// to acquire preliminary scores of requested items.
121 : std::vector<std::pair<DocID, float>> consume(Iterator &It);
122 :
123 : namespace detail {
124 : // Variadic template machinery.
125 0 : inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {}
126 : template <typename... TailT>
127 0 : void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
128 : std::unique_ptr<Iterator> Head, TailT... Tail) {
129 : Children.push_back(move(Head));
130 0 : populateChildren(Children, move(Tail)...);
131 0 : }
132 0 : } // namespace detail
133 :
134 : // A corpus is a set of documents, and a factory for iterators over them.
135 0 : class Corpus {
136 0 : DocID Size;
137 0 :
138 : public:
139 : explicit Corpus(DocID Size) : Size(Size) {}
140 0 :
141 0 : /// Returns AND Iterator which performs the intersection of the PostingLists
142 0 : /// of its children.
143 : ///
144 : /// consume(): AND Iterator returns the product of Childrens' boosting
145 0 : /// scores.
146 0 : std::unique_ptr<Iterator>
147 : intersect(std::vector<std::unique_ptr<Iterator>> Children) const;
148 :
149 : /// Returns OR Iterator which performs the union of the PostingLists of its
150 : /// children.
151 : ///
152 : /// consume(): OR Iterator returns the highest boost value among children
153 : /// containing the requested item.
154 0 : std::unique_ptr<Iterator>
155 : unionOf(std::vector<std::unique_ptr<Iterator>> Children) const;
156 :
157 : /// Returns TRUE Iterator which iterates over "virtual" PostingList
158 : /// containing all items in range [0, Size) in an efficient manner.
159 : std::unique_ptr<Iterator> all() const;
160 :
161 : /// Returns FALSE Iterator which iterates over no documents.
162 : std::unique_ptr<Iterator> none() const;
163 :
164 : /// Returns BOOST iterator which multiplies the score of each item by given
165 : /// factor. Boosting can be used as a computationally inexpensive filtering.
166 : /// Users can return significantly more items using consumeAndBoost() and
167 : /// then trim Top K using retrieval score.
168 : std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child,
169 : float Factor) const;
170 :
171 : /// Returns LIMIT iterator, which yields up to N elements of its child
172 : /// iterator. Elements only count towards the limit if they are part of the
173 : /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
174 : /// 1)) yields (2), not ().
175 : std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child,
176 : size_t Limit) const;
177 :
178 : /// This allows intersect(create(...), create(...)) syntax.
179 : template <typename... Args>
180 : std::unique_ptr<Iterator> intersect(Args... args) const {
181 : std::vector<std::unique_ptr<Iterator>> Children;
182 : detail::populateChildren(Children, std::forward<Args>(args)...);
183 : return intersect(move(Children));
184 : }
185 :
186 : /// This allows unionOf(create(...), create(...)) syntax.
187 : template <typename... Args>
188 : std::unique_ptr<Iterator> unionOf(Args... args) const {
189 : std::vector<std::unique_ptr<Iterator>> Children;
190 : detail::populateChildren(Children, std::forward<Args>(args)...);
191 : return unionOf(move(Children));
192 : }
193 : };
194 :
195 0 : } // namespace dex
196 0 : } // namespace clangd
197 0 : } // namespace clang
198 0 :
199 : #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
|