blob: b3d58d558595d07d0d981eb1729443ebc382b8a8 [file] [log] [blame]
Sam McCall87496412017-12-01 17:08:02 +00001//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
11// we consider the possible partial match states:
12//
13// u n i q u e _ p t r
14// +---------------------
15// |A . . . . . . . . . .
16// u|
17// |. . . . . . . . . . .
18// _|
19// |. . . . . . . O . . .
20// p|
21// |. . . . . . . . . . B
22//
23// Each dot represents some prefix of the pattern being matched against some
24// prefix of the word.
25// - A is the initial state: '' matched against ''
26// - O is an intermediate state: 'u_' matched against 'unique_'
27// - B is the target state: 'u_p' matched against 'unique_ptr'
28//
29// We aim to find the best path from A->B.
30// - Moving right (consuming a word character)
31// Always legal: not all word characters must match.
32// - Moving diagonally (consuming both a word and pattern character)
33// Legal if the characters match.
34// - Moving down (consuming a pattern character) is never legal.
35// Never legal: all pattern characters must match something.
Sam McCall77a719c2018-03-05 17:34:33 +000036// Characters are matched case-insensitively.
37// The first pattern character may only match the start of a word segment.
Sam McCall87496412017-12-01 17:08:02 +000038//
39// The scoring is based on heuristics:
40// - when matching a character, apply a bonus or penalty depending on the
41// match quality (does case match, do word segments align, etc)
42// - when skipping a character, apply a penalty if it hurts the match
43// (it starts a word segment, or splits the matched region, etc)
44//
45// These heuristics require the ability to "look backward" one character, to
46// see whether it was matched or not. Therefore the dynamic-programming matrix
47// has an extra dimension (last character matched).
48// Each entry also has an additional flag indicating whether the last-but-one
49// character matched, which is needed to trace back through the scoring table
50// and reconstruct the match.
51//
52// We treat strings as byte-sequences, so only ASCII has first-class support.
53//
54// This algorithm was inspired by VS code's client-side filtering, and aims
55// to be mostly-compatible.
56//
57//===----------------------------------------------------------------------===//
58
59#include "FuzzyMatch.h"
60#include "llvm/ADT/Optional.h"
61#include "llvm/Support/Format.h"
62
Sam McCall8fed6342017-12-01 20:03:19 +000063namespace clang {
64namespace clangd {
Sam McCall87496412017-12-01 17:08:02 +000065using namespace llvm;
Sam McCall87496412017-12-01 17:08:02 +000066
Sam McCalla8c5d3a2017-12-02 02:28:29 +000067constexpr int FuzzyMatcher::MaxPat;
68constexpr int FuzzyMatcher::MaxWord;
Sam McCall87496412017-12-01 17:08:02 +000069
70static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
71// A "negative infinity" score that won't overflow.
72// We use this to mark unreachable states and forbidden solutions.
73// Score field is 15 bits wide, min value is -2^14, we use half of that.
74static constexpr int AwfulScore = -(1 << 13);
75static bool isAwful(int S) { return S < AwfulScore / 2; }
76static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
77
78FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
Sam McCall77a719c2018-03-05 17:34:33 +000079 : PatN(std::min<int>(MaxPat, Pattern.size())),
Sam McCall5668cd32018-01-17 15:25:55 +000080 ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
Sam McCalleb74ab82018-01-19 15:03:49 +000081 std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
Sam McCall77a719c2018-03-05 17:34:33 +000082 for (int I = 0; I < PatN; ++I)
Sam McCall87496412017-12-01 17:08:02 +000083 LowPat[I] = lower(Pat[I]);
Sam McCall87496412017-12-01 17:08:02 +000084 Scores[0][0][Miss] = {0, Miss};
85 Scores[0][0][Match] = {AwfulScore, Miss};
86 for (int P = 0; P <= PatN; ++P)
87 for (int W = 0; W < P; ++W)
88 for (Action A : {Miss, Match})
89 Scores[P][W][A] = {AwfulScore, Miss};
Sam McCall09b4be52018-01-13 16:46:26 +000090 if (PatN > 0)
Sam McCall77a719c2018-03-05 17:34:33 +000091 calculateRoles(Pat, PatRole, PatTypeSet, PatN);
Sam McCall87496412017-12-01 17:08:02 +000092}
93
94Optional<float> FuzzyMatcher::match(StringRef Word) {
Sam McCall87496412017-12-01 17:08:02 +000095 if (!(WordContainsPattern = init(Word)))
96 return None;
Sam McCall09b4be52018-01-13 16:46:26 +000097 if (!PatN)
98 return 1;
Sam McCall87496412017-12-01 17:08:02 +000099 buildGraph();
100 auto Best = std::max(Scores[PatN][WordN][Miss].Score,
101 Scores[PatN][WordN][Match].Score);
102 if (isAwful(Best))
103 return None;
Sam McCallbc7cbb72018-06-06 12:38:37 +0000104 float Score =
105 ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
106 // If the pattern is as long as the word, we have an exact string match,
107 // since every pattern character must match something.
108 if (WordN == PatN)
109 Score *= 2; // May not be perfect 2 if case differs in a significant way.
110 return Score;
Sam McCall87496412017-12-01 17:08:02 +0000111}
112
113// Segmentation of words and patterns.
114// A name like "fooBar_baz" consists of several parts foo, bar, baz.
115// Aligning segmentation of word and pattern improves the fuzzy-match.
116// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
117//
118// First we classify each character into types (uppercase, lowercase, etc).
119// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
120
121// We only distinguish the types of characters that affect segmentation.
122// It's not obvious how to segment digits, we treat them as lowercase letters.
123// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
124// This means we require exact (case-sensitive) match.
Sam McCall8e97cca2017-12-02 03:35:19 +0000125enum FuzzyMatcher::CharType : unsigned char {
Sam McCall87496412017-12-01 17:08:02 +0000126 Empty = 0, // Before-the-start and after-the-end (and control chars).
127 Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
128 Upper = 2, // Uppercase letters.
129 Punctuation = 3, // ASCII punctuation (including Space)
130};
131
132// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
133// The top 6 bits of the char select the byte, the bottom 2 select the offset.
134// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
135constexpr static uint8_t CharTypes[] = {
136 0x00, 0x00, 0x00, 0x00, // Control characters
137 0x00, 0x00, 0x00, 0x00, // Control characters
138 0xff, 0xff, 0xff, 0xff, // Punctuation
139 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
140 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
141 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
142 0x57, 0x55, 0x55, 0x55, // ` and a-o
143 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
144 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
145 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
146 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
147 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
148};
149
150// Each character's Role is the Head or Tail of a segment, or a Separator.
151// e.g. XMLHttpRequest_Async
152// +--+---+------ +----
153// ^Head ^Tail ^Separator
Sam McCall8e97cca2017-12-02 03:35:19 +0000154enum FuzzyMatcher::CharRole : unsigned char {
Sam McCall87496412017-12-01 17:08:02 +0000155 Unknown = 0, // Stray control characters or impossible states.
156 Tail = 1, // Part of a word segment, but not the first character.
157 Head = 2, // The first character of a word segment.
158 Separator = 3, // Punctuation characters that separate word segments.
159};
160
161// The Role can be determined from the Type of a character and its neighbors:
162//
163// Example | Chars | Type | Role
164// ---------+--------------+-----
165// F(o)oBar | Foo | Ull | Tail
166// Foo(B)ar | oBa | lUl | Head
167// (f)oo | ^fo | Ell | Head
168// H(T)TP | HTT | UUU | Tail
169//
170// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
171// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
172// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
173constexpr static uint8_t CharRoles[] = {
174 // clang-format off
175 // Curr= Empty Lower Upper Separ
176 /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
177 /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
178 /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
179 /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
180 // clang-format on
181};
182
183template <typename T> static T packedLookup(const uint8_t *Data, int I) {
184 return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
185}
Sam McCall77a719c2018-03-05 17:34:33 +0000186void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
187 int N) {
Sam McCall09b4be52018-01-13 16:46:26 +0000188 assert(N > 0);
Sam McCall77a719c2018-03-05 17:34:33 +0000189 CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
190 TypeSet = 1 << Type;
Sam McCall87496412017-12-01 17:08:02 +0000191 // Types holds a sliding window of (Prev, Curr, Next) types.
192 // Initial value is (Empty, Empty, type of Text[0]).
Sam McCall77a719c2018-03-05 17:34:33 +0000193 int Types = Type;
Sam McCall87496412017-12-01 17:08:02 +0000194 // Rotate slides in the type of the next character.
195 auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
196 for (int I = 0; I < N - 1; ++I) {
197 // For each character, rotate in the next, and look up the role.
Sam McCall77a719c2018-03-05 17:34:33 +0000198 Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
199 TypeSet |= 1 << Type;
200 Rotate(Type);
Sam McCall87496412017-12-01 17:08:02 +0000201 *Out++ = packedLookup<CharRole>(CharRoles, Types);
202 }
203 // For the last character, the "next character" is Empty.
204 Rotate(Empty);
205 *Out++ = packedLookup<CharRole>(CharRoles, Types);
206}
207
208// Sets up the data structures matching Word.
209// Returns false if we can cheaply determine that no match is possible.
210bool FuzzyMatcher::init(StringRef NewWord) {
211 WordN = std::min<int>(MaxWord, NewWord.size());
212 if (PatN > WordN)
213 return false;
Sam McCalleb74ab82018-01-19 15:03:49 +0000214 std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
Sam McCall09b4be52018-01-13 16:46:26 +0000215 if (PatN == 0)
216 return true;
Sam McCall87496412017-12-01 17:08:02 +0000217 for (int I = 0; I < WordN; ++I)
218 LowWord[I] = lower(Word[I]);
219
220 // Cheap subsequence check.
221 for (int W = 0, P = 0; P != PatN; ++W) {
222 if (W == WordN)
223 return false;
224 if (LowWord[W] == LowPat[P])
225 ++P;
226 }
227
Sam McCall77a719c2018-03-05 17:34:33 +0000228 // FIXME: some words are hard to tokenize algorithmically.
229 // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
230 // We could add a tokenization dictionary for common stdlib names.
231 calculateRoles(Word, WordRole, WordTypeSet, WordN);
Sam McCall87496412017-12-01 17:08:02 +0000232 return true;
233}
234
235// The forwards pass finds the mappings of Pattern onto Word.
236// Score = best score achieved matching Word[..W] against Pat[..P].
237// Unlike other tables, indices range from 0 to N *inclusive*
238// Matched = whether we chose to match Word[W] with Pat[P] or not.
239//
240// Points are mostly assigned to matched characters, with 1 being a good score
241// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
242// This range is not strict: we can apply larger bonuses/penalties, or penalize
243// non-matched characters.
244void FuzzyMatcher::buildGraph() {
245 for (int W = 0; W < WordN; ++W) {
246 Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
247 Miss};
248 Scores[0][W + 1][Match] = {AwfulScore, Miss};
249 }
250 for (int P = 0; P < PatN; ++P) {
251 for (int W = P; W < WordN; ++W) {
252 auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
253
254 auto MatchMissScore = PreMiss[Match].Score;
255 auto MissMissScore = PreMiss[Miss].Score;
256 if (P < PatN - 1) { // Skipping trailing characters is always free.
257 MatchMissScore -= skipPenalty(W, Match);
258 MissMissScore -= skipPenalty(W, Miss);
259 }
260 Score[Miss] = (MatchMissScore > MissMissScore)
261 ? ScoreInfo{MatchMissScore, Match}
262 : ScoreInfo{MissMissScore, Miss};
263
Sam McCall8b2dcc12018-06-14 13:50:30 +0000264 auto &PreMatch = Scores[P][W];
265 auto MatchMatchScore =
266 allowMatch(P, W, Match)
267 ? PreMatch[Match].Score + matchBonus(P, W, Match)
268 : AwfulScore;
269 auto MissMatchScore = allowMatch(P, W, Miss)
270 ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
271 : AwfulScore;
272 Score[Match] = (MatchMatchScore > MissMatchScore)
273 ? ScoreInfo{MatchMatchScore, Match}
274 : ScoreInfo{MissMatchScore, Miss};
Sam McCall87496412017-12-01 17:08:02 +0000275 }
276 }
277}
278
Sam McCall8b2dcc12018-06-14 13:50:30 +0000279bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
Sam McCall77a719c2018-03-05 17:34:33 +0000280 if (LowPat[P] != LowWord[W])
281 return false;
Sam McCall8b2dcc12018-06-14 13:50:30 +0000282 // We require a "strong" match:
283 // - for the first pattern character. [foo] !~ "barefoot"
284 // - after a gap. [pat] !~ "patnther"
285 if (Last == Miss) {
286 // We're banning matches outright, so conservatively accept some other cases
287 // where our segmentation might be wrong:
288 // - allow matching B in ABCDef (but not in NDEBUG)
289 // - we'd like to accept print in sprintf, but too many false positives
290 if (WordRole[W] == Tail &&
291 (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
292 return false;
293 }
294 return true;
Sam McCall77a719c2018-03-05 17:34:33 +0000295}
296
297int FuzzyMatcher::skipPenalty(int W, Action Last) const {
Sam McCall87496412017-12-01 17:08:02 +0000298 int S = 0;
299 if (WordRole[W] == Head) // Skipping a segment.
300 S += 1;
301 if (Last == Match) // Non-consecutive match.
302 S += 2; // We'd rather skip a segment than split our match.
303 return S;
304}
305
Sam McCall77a719c2018-03-05 17:34:33 +0000306int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
Sam McCall87496412017-12-01 17:08:02 +0000307 assert(LowPat[P] == LowWord[W]);
308 int S = 1;
309 // Bonus: pattern so far is a (case-insensitive) prefix of the word.
310 if (P == W) // We can't skip pattern characters, so we must have matched all.
311 ++S;
312 // Bonus: case matches, or a Head in the pattern aligns with one in the word.
Sam McCall77a719c2018-03-05 17:34:33 +0000313 if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
Sam McCall87496412017-12-01 17:08:02 +0000314 (PatRole[P] == Head && WordRole[W] == Head))
315 ++S;
316 // Penalty: matching inside a segment (and previous char wasn't matched).
317 if (WordRole[W] == Tail && P && Last == Miss)
318 S -= 3;
319 // Penalty: a Head in the pattern matches in the middle of a word segment.
320 if (PatRole[P] == Head && WordRole[W] == Tail)
321 --S;
322 // Penalty: matching the first pattern character in the middle of a segment.
323 if (P == 0 && WordRole[W] == Tail)
324 S -= 4;
325 assert(S <= PerfectBonus);
326 return S;
327}
328
329llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
330 llvm::SmallString<256> Result;
331 OS << "=== Match \"" << StringRef(Word, WordN) << "\" against ["
332 << StringRef(Pat, PatN) << "] ===\n";
Sam McCall09b4be52018-01-13 16:46:26 +0000333 if (PatN == 0) {
334 OS << "Pattern is empty: perfect match.\n";
335 return Result = StringRef(Word, WordN);
336 }
337 if (WordN == 0) {
338 OS << "Word is empty: no match.\n";
339 return Result;
340 }
Sam McCall87496412017-12-01 17:08:02 +0000341 if (!WordContainsPattern) {
342 OS << "Substring check failed.\n";
343 return Result;
344 } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
345 Scores[PatN][WordN][Miss].Score))) {
346 OS << "Substring check passed, but all matches are forbidden\n";
347 }
Sam McCall77a719c2018-03-05 17:34:33 +0000348 if (!(PatTypeSet & 1 << Upper))
Sam McCall87496412017-12-01 17:08:02 +0000349 OS << "Lowercase query, so scoring ignores case\n";
350
351 // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
352 // The Score table has cumulative scores, subtracting along this path gives
353 // us the per-letter scores.
354 Action Last =
355 (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
356 ? Match
357 : Miss;
358 int S[MaxWord];
359 Action A[MaxWord];
360 for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
361 A[W] = Last;
362 const auto &Cell = Scores[P + 1][W + 1][Last];
363 if (Last == Match)
364 --P;
365 const auto &Prev = Scores[P + 1][W][Cell.Prev];
366 S[W] = Cell.Score - Prev.Score;
367 Last = Cell.Prev;
368 }
369 for (int I = 0; I < WordN; ++I) {
370 if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
371 Result.push_back('[');
372 if (A[I] == Miss && I > 0 && A[I - 1] == Match)
373 Result.push_back(']');
374 Result.push_back(Word[I]);
375 }
376 if (A[WordN - 1] == Match)
377 Result.push_back(']');
378
379 for (char C : StringRef(Word, WordN))
380 OS << " " << C << " ";
381 OS << "\n";
382 for (int I = 0, J = 0; I < WordN; I++)
383 OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
384 OS << "\n";
385 for (int I = 0; I < WordN; I++)
386 OS << format("%2d ", S[I]);
387 OS << "\n";
388
389 OS << "\nSegmentation:";
390 OS << "\n'" << StringRef(Word, WordN) << "'\n ";
391 for (int I = 0; I < WordN; ++I)
392 OS << "?-+ "[static_cast<int>(WordRole[I])];
393 OS << "\n[" << StringRef(Pat, PatN) << "]\n ";
394 for (int I = 0; I < PatN; ++I)
395 OS << "?-+ "[static_cast<int>(PatRole[I])];
396 OS << "\n";
397
398 OS << "\nScoring table (last-Miss, last-Match):\n";
399 OS << " | ";
400 for (char C : StringRef(Word, WordN))
401 OS << " " << C << " ";
402 OS << "\n";
403 OS << "-+----" << std::string(WordN * 4, '-') << "\n";
404 for (int I = 0; I <= PatN; ++I) {
405 for (Action A : {Miss, Match}) {
406 OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
407 for (int J = 0; J <= WordN; ++J) {
408 if (!isAwful(Scores[I][J][A].Score))
409 OS << format("%3d%c", Scores[I][J][A].Score,
410 Scores[I][J][A].Prev == Match ? '*' : ' ');
411 else
412 OS << " ";
413 }
414 OS << "\n";
415 }
416 }
417
418 return Result;
419}
Sam McCall8fed6342017-12-01 20:03:19 +0000420
421} // namespace clangd
422} // namespace clang