clang-tools  7.0.0
FuzzyMatch.cpp
Go to the documentation of this file.
1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
11 // we consider the possible partial match states:
12 //
13 // u n i q u e _ p t r
14 // +---------------------
15 // |A . . . . . . . . . .
16 // u|
17 // |. . . . . . . . . . .
18 // _|
19 // |. . . . . . . O . . .
20 // p|
21 // |. . . . . . . . . . B
22 //
23 // Each dot represents some prefix of the pattern being matched against some
24 // prefix of the word.
25 // - A is the initial state: '' matched against ''
26 // - O is an intermediate state: 'u_' matched against 'unique_'
27 // - B is the target state: 'u_p' matched against 'unique_ptr'
28 //
29 // We aim to find the best path from A->B.
30 // - Moving right (consuming a word character)
31 // Always legal: not all word characters must match.
32 // - Moving diagonally (consuming both a word and pattern character)
33 // Legal if the characters match.
34 // - Moving down (consuming a pattern character) is never legal.
35 // Never legal: all pattern characters must match something.
36 // Characters are matched case-insensitively.
37 // The first pattern character may only match the start of a word segment.
38 //
39 // The scoring is based on heuristics:
40 // - when matching a character, apply a bonus or penalty depending on the
41 // match quality (does case match, do word segments align, etc)
42 // - when skipping a character, apply a penalty if it hurts the match
43 // (it starts a word segment, or splits the matched region, etc)
44 //
45 // These heuristics require the ability to "look backward" one character, to
46 // see whether it was matched or not. Therefore the dynamic-programming matrix
47 // has an extra dimension (last character matched).
48 // Each entry also has an additional flag indicating whether the last-but-one
49 // character matched, which is needed to trace back through the scoring table
50 // and reconstruct the match.
51 //
52 // We treat strings as byte-sequences, so only ASCII has first-class support.
53 //
54 // This algorithm was inspired by VS code's client-side filtering, and aims
55 // to be mostly-compatible.
56 //
57 //===----------------------------------------------------------------------===//
58 
59 #include "FuzzyMatch.h"
60 #include "llvm/ADT/Optional.h"
61 #include "llvm/Support/Format.h"
62 
63 namespace clang {
64 namespace clangd {
65 using namespace llvm;
66 
67 constexpr int FuzzyMatcher::MaxPat;
68 constexpr int FuzzyMatcher::MaxWord;
69 
70 static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
71 // A "negative infinity" score that won't overflow.
72 // We use this to mark unreachable states and forbidden solutions.
73 // Score field is 15 bits wide, min value is -2^14, we use half of that.
74 static constexpr int AwfulScore = -(1 << 13);
75 static bool isAwful(int S) { return S < AwfulScore / 2; }
76 static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
77 
78 FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
79  : PatN(std::min<int>(MaxPat, Pattern.size())),
80  ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
81  std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
82  for (int I = 0; I < PatN; ++I)
83  LowPat[I] = lower(Pat[I]);
84  Scores[0][0][Miss] = {0, Miss};
85  Scores[0][0][Match] = {AwfulScore, Miss};
86  for (int P = 0; P <= PatN; ++P)
87  for (int W = 0; W < P; ++W)
88  for (Action A : {Miss, Match})
89  Scores[P][W][A] = {AwfulScore, Miss};
90  PatTypeSet =
91  calculateRoles(StringRef(Pat, PatN), makeMutableArrayRef(PatRole, PatN));
92 }
93 
94 Optional<float> FuzzyMatcher::match(StringRef Word) {
95  if (!(WordContainsPattern = init(Word)))
96  return None;
97  if (!PatN)
98  return 1;
99  buildGraph();
100  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
101  Scores[PatN][WordN][Match].Score);
102  if (isAwful(Best))
103  return None;
104  float Score =
105  ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
106  // If the pattern is as long as the word, we have an exact string match,
107  // since every pattern character must match something.
108  if (WordN == PatN)
109  Score *= 2; // May not be perfect 2 if case differs in a significant way.
110  return Score;
111 }
112 
113 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
114 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
115 // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
116 constexpr static uint8_t CharTypes[] = {
117  0x00, 0x00, 0x00, 0x00, // Control characters
118  0x00, 0x00, 0x00, 0x00, // Control characters
119  0xff, 0xff, 0xff, 0xff, // Punctuation
120  0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
121  0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
122  0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
123  0x57, 0x55, 0x55, 0x55, // ` and a-o
124  0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
125  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
126  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
127  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
128  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
129 };
130 
131 // The Role can be determined from the Type of a character and its neighbors:
132 //
133 // Example | Chars | Type | Role
134 // ---------+--------------+-----
135 // F(o)oBar | Foo | Ull | Tail
136 // Foo(B)ar | oBa | lUl | Head
137 // (f)oo | ^fo | Ell | Head
138 // H(T)TP | HTT | UUU | Tail
139 //
140 // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
141 // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
142 // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
143 constexpr static uint8_t CharRoles[] = {
144  // clang-format off
145  // Curr= Empty Lower Upper Separ
146  /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
147  /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
148  /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
149  /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
150  // clang-format on
151 };
152 
153 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
154  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
155 }
156 CharTypeSet calculateRoles(StringRef Text, MutableArrayRef<CharRole> Roles) {
157  assert(Text.size() == Roles.size());
158  if (Text.size() == 0)
159  return 0;
160  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
161  CharTypeSet TypeSet = 1 << Type;
162  // Types holds a sliding window of (Prev, Curr, Next) types.
163  // Initial value is (Empty, Empty, type of Text[0]).
164  int Types = Type;
165  // Rotate slides in the type of the next character.
166  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
167  for (unsigned I = 0; I < Text.size() - 1; ++I) {
168  // For each character, rotate in the next, and look up the role.
169  Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
170  TypeSet |= 1 << Type;
171  Rotate(Type);
172  Roles[I] = packedLookup<CharRole>(CharRoles, Types);
173  }
174  // For the last character, the "next character" is Empty.
175  Rotate(Empty);
176  Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
177  return TypeSet;
178 }
179 
180 // Sets up the data structures matching Word.
181 // Returns false if we can cheaply determine that no match is possible.
182 bool FuzzyMatcher::init(StringRef NewWord) {
183  WordN = std::min<int>(MaxWord, NewWord.size());
184  if (PatN > WordN)
185  return false;
186  std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
187  if (PatN == 0)
188  return true;
189  for (int I = 0; I < WordN; ++I)
190  LowWord[I] = lower(Word[I]);
191 
192  // Cheap subsequence check.
193  for (int W = 0, P = 0; P != PatN; ++W) {
194  if (W == WordN)
195  return false;
196  if (LowWord[W] == LowPat[P])
197  ++P;
198  }
199 
200  // FIXME: some words are hard to tokenize algorithmically.
201  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
202  // We could add a tokenization dictionary for common stdlib names.
203  WordTypeSet = calculateRoles(StringRef(Word, WordN),
204  makeMutableArrayRef(WordRole, WordN));
205  return true;
206 }
207 
208 // The forwards pass finds the mappings of Pattern onto Word.
209 // Score = best score achieved matching Word[..W] against Pat[..P].
210 // Unlike other tables, indices range from 0 to N *inclusive*
211 // Matched = whether we chose to match Word[W] with Pat[P] or not.
212 //
213 // Points are mostly assigned to matched characters, with 1 being a good score
214 // and 3 being a great one. So we treat the score range as [0, 3 * PatN].
215 // This range is not strict: we can apply larger bonuses/penalties, or penalize
216 // non-matched characters.
217 void FuzzyMatcher::buildGraph() {
218  for (int W = 0; W < WordN; ++W) {
219  Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
220  Miss};
221  Scores[0][W + 1][Match] = {AwfulScore, Miss};
222  }
223  for (int P = 0; P < PatN; ++P) {
224  for (int W = P; W < WordN; ++W) {
225  auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
226 
227  auto MatchMissScore = PreMiss[Match].Score;
228  auto MissMissScore = PreMiss[Miss].Score;
229  if (P < PatN - 1) { // Skipping trailing characters is always free.
230  MatchMissScore -= skipPenalty(W, Match);
231  MissMissScore -= skipPenalty(W, Miss);
232  }
233  Score[Miss] = (MatchMissScore > MissMissScore)
234  ? ScoreInfo{MatchMissScore, Match}
235  : ScoreInfo{MissMissScore, Miss};
236 
237  auto &PreMatch = Scores[P][W];
238  auto MatchMatchScore =
239  allowMatch(P, W, Match)
240  ? PreMatch[Match].Score + matchBonus(P, W, Match)
241  : AwfulScore;
242  auto MissMatchScore = allowMatch(P, W, Miss)
243  ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
244  : AwfulScore;
245  Score[Match] = (MatchMatchScore > MissMatchScore)
246  ? ScoreInfo{MatchMatchScore, Match}
247  : ScoreInfo{MissMatchScore, Miss};
248  }
249  }
250 }
251 
252 bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
253  if (LowPat[P] != LowWord[W])
254  return false;
255  // We require a "strong" match:
256  // - for the first pattern character. [foo] !~ "barefoot"
257  // - after a gap. [pat] !~ "patnther"
258  if (Last == Miss) {
259  // We're banning matches outright, so conservatively accept some other cases
260  // where our segmentation might be wrong:
261  // - allow matching B in ABCDef (but not in NDEBUG)
262  // - we'd like to accept print in sprintf, but too many false positives
263  if (WordRole[W] == Tail &&
264  (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
265  return false;
266  }
267  return true;
268 }
269 
270 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
271  int S = 0;
272  if (WordRole[W] == Head) // Skipping a segment.
273  S += 1;
274  if (Last == Match) // Non-consecutive match.
275  S += 2; // We'd rather skip a segment than split our match.
276  return S;
277 }
278 
279 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
280  assert(LowPat[P] == LowWord[W]);
281  int S = 1;
282  // Bonus: pattern so far is a (case-insensitive) prefix of the word.
283  if (P == W) // We can't skip pattern characters, so we must have matched all.
284  ++S;
285  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
286  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
287  (PatRole[P] == Head && WordRole[W] == Head))
288  ++S;
289  // Penalty: matching inside a segment (and previous char wasn't matched).
290  if (WordRole[W] == Tail && P && Last == Miss)
291  S -= 3;
292  // Penalty: a Head in the pattern matches in the middle of a word segment.
293  if (PatRole[P] == Head && WordRole[W] == Tail)
294  --S;
295  // Penalty: matching the first pattern character in the middle of a segment.
296  if (P == 0 && WordRole[W] == Tail)
297  S -= 4;
298  assert(S <= PerfectBonus);
299  return S;
300 }
301 
302 llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
303  llvm::SmallString<256> Result;
304  OS << "=== Match \"" << StringRef(Word, WordN) << "\" against ["
305  << StringRef(Pat, PatN) << "] ===\n";
306  if (PatN == 0) {
307  OS << "Pattern is empty: perfect match.\n";
308  return Result = StringRef(Word, WordN);
309  }
310  if (WordN == 0) {
311  OS << "Word is empty: no match.\n";
312  return Result;
313  }
314  if (!WordContainsPattern) {
315  OS << "Substring check failed.\n";
316  return Result;
317  } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
318  Scores[PatN][WordN][Miss].Score))) {
319  OS << "Substring check passed, but all matches are forbidden\n";
320  }
321  if (!(PatTypeSet & 1 << Upper))
322  OS << "Lowercase query, so scoring ignores case\n";
323 
324  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
325  // The Score table has cumulative scores, subtracting along this path gives
326  // us the per-letter scores.
327  Action Last =
328  (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
329  ? Match
330  : Miss;
331  int S[MaxWord];
332  Action A[MaxWord];
333  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
334  A[W] = Last;
335  const auto &Cell = Scores[P + 1][W + 1][Last];
336  if (Last == Match)
337  --P;
338  const auto &Prev = Scores[P + 1][W][Cell.Prev];
339  S[W] = Cell.Score - Prev.Score;
340  Last = Cell.Prev;
341  }
342  for (int I = 0; I < WordN; ++I) {
343  if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
344  Result.push_back('[');
345  if (A[I] == Miss && I > 0 && A[I - 1] == Match)
346  Result.push_back(']');
347  Result.push_back(Word[I]);
348  }
349  if (A[WordN - 1] == Match)
350  Result.push_back(']');
351 
352  for (char C : StringRef(Word, WordN))
353  OS << " " << C << " ";
354  OS << "\n";
355  for (int I = 0, J = 0; I < WordN; I++)
356  OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
357  OS << "\n";
358  for (int I = 0; I < WordN; I++)
359  OS << format("%2d ", S[I]);
360  OS << "\n";
361 
362  OS << "\nSegmentation:";
363  OS << "\n'" << StringRef(Word, WordN) << "'\n ";
364  for (int I = 0; I < WordN; ++I)
365  OS << "?-+ "[static_cast<int>(WordRole[I])];
366  OS << "\n[" << StringRef(Pat, PatN) << "]\n ";
367  for (int I = 0; I < PatN; ++I)
368  OS << "?-+ "[static_cast<int>(PatRole[I])];
369  OS << "\n";
370 
371  OS << "\nScoring table (last-Miss, last-Match):\n";
372  OS << " | ";
373  for (char C : StringRef(Word, WordN))
374  OS << " " << C << " ";
375  OS << "\n";
376  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
377  for (int I = 0; I <= PatN; ++I) {
378  for (Action A : {Miss, Match}) {
379  OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
380  for (int J = 0; J <= WordN; ++J) {
381  if (!isAwful(Scores[I][J][A].Score))
382  OS << format("%3d%c", Scores[I][J][A].Score,
383  Scores[I][J][A].Prev == Match ? '*' : ' ');
384  else
385  OS << " ";
386  }
387  OS << "\n";
388  }
389  }
390 
391  return Result;
392 }
393 
394 } // namespace clangd
395 } // namespace clang
Some operations such as code completion produce a set of candidates.
static constexpr uint8_t CharTypes[]
Definition: FuzzyMatch.cpp:116
static constexpr uint8_t CharRoles[]
Definition: FuzzyMatch.cpp:143
Documents should not be synced at all.
unsigned char CharTypeSet
Definition: FuzzyMatch.h:50
static bool isAwful(int S)
Definition: FuzzyMatch.cpp:75
FuzzyMatcher(llvm::StringRef Pattern)
Definition: FuzzyMatch.cpp:78
llvm::SmallString< 256 > dumpLast(llvm::raw_ostream &) const
Definition: FuzzyMatch.cpp:302
static char lower(char C)
Definition: FuzzyMatch.cpp:70
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
static constexpr int PerfectBonus
Definition: FuzzyMatch.cpp:76
static constexpr int AwfulScore
Definition: FuzzyMatch.cpp:74
llvm::Optional< float > match(llvm::StringRef Word)
Definition: FuzzyMatch.cpp:94
CharTypeSet calculateRoles(StringRef Text, MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:156
static T packedLookup(const uint8_t *Data, int I)
Definition: FuzzyMatch.cpp:153